tidying script to run from cmd and via ssh

2022-05-28 09:40:24 +01:00 · 2022-05-28 09:40:24 +01:00 · b6f0308e42
commit b6f0308e42
parent 0a84a4b4dc
4 changed files with 271 additions and 76 deletions
--- a/UQ_Imbalance.py
+++ b/UQ_Imbalance.py
@@ -117,15 +117,15 @@ print(len(X_enn)) #53

 #------------------------------
 # Determine categorical and numerical features
-numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
+numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
 numerical_ix
 num_featuresL = list(numerical_ix)
-numerical_colind = input_df.columns.get_indexer(list(numerical_ix) )
+numerical_colind = X.columns.get_indexer(list(numerical_ix) )
 numerical_colind

-categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
 categorical_ix    
-categorical_colind = input_df.columns.get_indexer(list(categorical_ix))
+categorical_colind = X.columns.get_indexer(list(categorical_ix))
 categorical_colind

 k_sm = 5 # 5 is deafult
--- a/UQ_pnca_ML.py
+++ b/UQ_pnca_ML.py
@@ -10,77 +10,57 @@ Created on Sun Mar  6 13:41:54 2022
 import os, sys
 import pandas as pd
 import numpy as np
+print(np.__version__)
+print(pd.__version__)
 import pprint as pp
 from copy import deepcopy
 from sklearn import linear_model
 from sklearn import datasets
 from collections import Counter

-from sklearn.linear_model import LogisticRegression, LinearRegression
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
+from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier
+
 from sklearn.naive_bayes import BernoulliNB
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.svm import SVC
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
-from sklearn.ensemble import AdaBoostClassifier
-from sklearn.ensemble import GradientBoostingClassifier
-from sklearn.ensemble import BaggingClassifier
+from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
 from sklearn.naive_bayes import GaussianNB
-from sklearn.gaussian_process import GaussianProcessClassifier
-from sklearn.gaussian_process import kernels
-from sklearn.gaussian_process.kernels import RBF
-from sklearn.gaussian_process.kernels import DotProduct
-from sklearn.gaussian_process.kernels import Matern
-from sklearn.gaussian_process.kernels import RationalQuadratic
-from sklearn.gaussian_process.kernels import WhiteKernel
+from sklearn.gaussian_process import GaussianProcessClassifier, kernels
+from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel

-from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
 from sklearn.neural_network import MLPClassifier

-from sklearn.linear_model import RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier
-from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 from sklearn.svm import SVC
 from xgboost import XGBClassifier
 from sklearn.naive_bayes import MultinomialNB
-from sklearn.linear_model import SGDClassifier
 from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

 from sklearn.compose import ColumnTransformer
 from sklearn.compose import make_column_transformer

-from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score 
-from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score
-from sklearn.metrics import jaccard_score
-from sklearn.metrics import make_scorer
-from sklearn.metrics import classification_report
+from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report

-from sklearn.metrics import average_precision_score
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
+from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold

-from sklearn.model_selection import cross_validate
-from sklearn.model_selection import train_test_split
-from sklearn.model_selection import StratifiedKFold
+from sklearn.pipeline import Pipeline, make_pipeline

-from sklearn.pipeline import Pipeline
-from sklearn.pipeline import make_pipeline
+from sklearn.feature_selection import RFE, RFECV

-from sklearn.feature_selection import RFE
-from sklearn.feature_selection import RFECV
 import itertools
-#import seaborn as sns
+import seaborn as sns
 import matplotlib.pyplot as plt
-import numpy as np
-print(np.__version__)
-print(pd.__version__)
+
 from statistics import mean, stdev, median, mode

 from imblearn.over_sampling import RandomOverSampler
 from imblearn.under_sampling import RandomUnderSampler
 from imblearn.over_sampling import SMOTE
-from imblearn.pipeline import Pipeline
 from sklearn.datasets import make_classification
-from sklearn.model_selection import cross_validate, cross_val_score
-from sklearn.model_selection import RepeatedStratifiedKFold
-from sklearn.ensemble import AdaBoostClassifier
 from imblearn.combine import SMOTEENN
 from imblearn.combine import SMOTETomek

@@ -124,8 +104,8 @@ os.chdir(homedir + "/git/ML_AI_training/")
 from MultClassPipe2 import MultClassPipeline2
 from loopity_loop import MultClassPipeSKFLoop
 #from MultClassPipe3 import MultClassPipeSKFCV
-from UQ_MultClassPipe4 import MultClassPipeSKFCV
-
+#from UQ_MultClassPipe4 import MultClassPipeSKFCV
+from UQ_MultModelsCl import MultModelsCl
 #gene = 'pncA'
 #drug = 'pyrazinamide'

--- a/classification_params_FS.py
+++ b/classification_params_FS.py
@@ -6,17 +6,7 @@
 # autosklearn --> pipleine --> components --> classification
 # https://github.com/automl/auto-sklearn/tree/master/autosklearn/pipeline/components/classification

-# TOADD: 
-# LDA
-https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/lda.py
-# Multinomial_nb
-https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/multinomial_nb.py
-# passive_aggressive
-https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/passive_aggressive.py
-# SGD
-https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/sgd.py
-
-
+# ADDED 27/05/2022: Extra Tree + LRCV and RCCV
 ######https://scikit-learn.org/stable/supervised_learning.html

 ########################################################################
@@ -57,7 +47,7 @@ param_grid_abc = [
 #https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/extra_trees.py
 #https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
 #======================
-estimator =  ExtraTreesClassifier**rs)
+estimator =  ExtraTreesClassifier(**rs)

 # Define pipleline with steps
 pipe_abc = Pipeline([
@@ -85,6 +75,40 @@ param_grid_abc = [
        }
 ]

+
+#======================
+# ExtraTreeClassifier(): a single extremely randomized tree
+# (not the ExtraTreesClassifier ensemble)
+#https://scikit-learn.org/stable/modules/generated/sklearn.tree.ExtraTreeClassifier.html
+#======================
+estimator =  ExtraTreeClassifier(**rs)
+
+# Pipeline: MinMax scaling -> RFECV feature selection (ranked by a decision tree) -> classifier
+pipe_abc = Pipeline([
+    ('pre', MinMaxScaler())
+    , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 'matthews_corrcoef'))
+#    , ('fs', RFECV(estimator, cv = cv, scoring = 'matthews_corrcoef'))    
+#    , ('clf',  ExtraTreesClassifier(**rs))])
+    , ('clf',  estimator)
+    ])
+
+# Hyperparameter space: the first dict tunes the RFECV step, the second the classifier
+param_grid_abc = [
+    {
+    'fs__min_features_to_select' : [1,2]
+#     , 'fs__cv': [cv]
+     },
+    {   # classifier hyperparameters
+#        'clf': [ExtraTreeClassifier(**rs)],
+         'clf__max_depth': [None],
+         'clf__criterion': ['gini', 'entropy'],
+         'clf__max_features': [None, 'sqrt', 'log2', 0.5, 1],
+         'clf__min_samples_leaf': [1, 5, 10, 15, 20],
+         'clf__min_samples_split': [2, 5, 10, 15, 20]
+        }
+]
+
+
 #===========================
 # DecisionTreeClassifier()
 https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/decision_tree.py
@@ -304,8 +328,8 @@ param_grid_gbc = [
 #########################################################################
 #=========================== 
 # GaussianNB()
-https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/gaussian_nb.py
-https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
+#https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/gaussian_nb.py
+#https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
 #=========================== 
 # Define estimator
 estimator =  GaussianNB()
@@ -439,12 +463,58 @@ param_grid_lr = [
        'clf__solver': ['liblinear']
    }

+]
+
+#########################################################################
+#=========================== 
+# LogisticRegressionCV () *
+# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html
+#=========================== 
+# Define estimator (it picks C itself through its own 10-fold CV)
+estimator =  LogisticRegressionCV(cv = 10, **rs)
+
+# Pipeline: MinMax scaling -> RFECV feature selection (ranked by plain LogisticRegression) -> classifier
+pipe_lr = Pipeline([
+    ('pre', MinMaxScaler())
+    , ('fs', RFECV(LogisticRegression(**rs), cv = rskf_cv, scoring = 'matthews_corrcoef'))
+#    , ('fs', RFECV(estimator, cv = cv, scoring = 'matthews_corrcoef'))       
+    , ('clf',  estimator)])
+
+# Hyperparameter space: the first dict tunes the RFECV step, the others the classifier (one dict per solver family)
+param_grid_lr = [
+    
+    {'fs__min_features_to_select' : [1,2]
+#     , 'fs__cv': [rskf_cv]
+     },
+    
+    {
+#       'clf': [LogisticRegressionCV(cv = 10, **rs)],
+        'clf__Cs': [np.logspace(0, 4, 10)],   # LogisticRegressionCV exposes `Cs` (searched internally), not `C`
+        'clf__penalty': ['l1', 'l2'],          # 'none' is rejected by LogisticRegressionCV; 'elasticnet' needs l1_ratios
+        'clf__max_iter': list(range(100,800,100)),
+        'clf__solver': ['saga']
+    },
+    {
+#       'clf': [LogisticRegressionCV(cv = 10, **rs)],
+        'clf__Cs': [np.logspace(0, 4, 10)],   # one candidate: the whole C grid handed to the estimator
+        'clf__penalty': ['l2'],                # 'none' is rejected by LogisticRegressionCV
+        'clf__max_iter': list(range(100,800,100)),
+        'clf__solver': ['newton-cg', 'lbfgs', 'sag']
+    }, 
+    {
+#       'clf': [LogisticRegressionCV(cv = 10, **rs)],
+        'clf__Cs': [np.logspace(0, 4, 10)],
+        'clf__penalty': ['l1', 'l2'],
+        'clf__max_iter': list(range(100,800,100)),
+        'clf__solver': ['liblinear']
+    }
+
 ]
 #########################################################################
 #================== 
 # MLPClassifier()
-https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/mlp.py
-https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
+#https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/mlp.py
+#https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
 #================== 
 # Define estimator
 estimator =  MLPClassifier(**rs)
@@ -531,6 +601,35 @@ param_grid_rc = [
      'clf__alpha': [0.1, 0.2, 0.5, 0.8, 1.0]
     }
 ]
+
+#######################################################################
+#====================
+# RidgeClassifierCV() *
+#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifierCV.html
+#====================
+
+# Define estimator (RidgeClassifierCV takes no random_state, so rs is not unpacked here)
+estimator =  RidgeClassifierCV(cv = 10)
+
+# Pipeline: MinMax scaling -> RFECV feature selection (ranked by a decision tree) -> classifier
+pipe_rc = Pipeline([
+    ('pre', MinMaxScaler())
+    , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 'matthews_corrcoef'))
+#    , ('fs', RFECV(estimator, cv = cv, scoring = 'matthews_corrcoef'))
+    , ('clf',  estimator)
+    ])
+
+param_grid_rc = [
+    {
+    'fs__min_features_to_select' : [1,2]
+#     , 'fs__cv': [cv]
+     },
+    
+    {
+     #'clf' : [RidgeClassifierCV(cv = 10)],
+      'clf__alphas': [(0.1, 0.2, 0.5, 0.8, 1.0)]   # the CV variant takes the whole grid as `alphas`, not a single `alpha`
+     }
+]
 #######################################################################
 #========
 # SVC()
--- a/uq_ml_models_FS/scriptfsycm.py
+++ b/uq_ml_models_FS/scriptfsycm.py
@@ -27,17 +27,42 @@ from sklearn.model_selection import train_test_split, cross_validate, cross_val_
 # Metric
 from sklearn.metrics import mean_squared_error, make_scorer, roc_auc_score, f1_score, matthews_corrcoef, accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report

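+# Shared settings: random seed, n_jobs, scorers and CV splitters. NOTE(review): precision_score, recall_score and jaccard_score are not in the metrics import just above - confirm they are imported elsewhere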
+rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
+
+scoring_fn =  ({'accuracy'      : make_scorer(accuracy_score)
+                 , 'fscore'     : make_scorer(f1_score)
+                 , 'mcc'        : make_scorer(matthews_corrcoef)
+                 , 'precision'  : make_scorer(precision_score)
+                 , 'recall'     : make_scorer(recall_score)
+                 , 'roc_auc'    : make_scorer(roc_auc_score)
+                 , 'jcc'        : make_scorer(jaccard_score)
+            }) 
+  
+skf_cv = StratifiedKFold(n_splits = 10
+                          #, shuffle = False, random_state= None)
+                           , shuffle = True,**rs)
+
+rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+                                  , n_repeats = 3
+                                  , **rs)
+
+mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
+jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
+#%% YC 
 #def run_all_ML(input_pd, target_label, bts_input, bts_target, var_type):
-def run_all_ML(input_pd, target_label, preprocess = True, var_type = 'numerical'):
+def run_all_ML(input_pd, target_label, blind_test_input_df, blind_test_target, preprocess = True, var_type = 'numerical'):

    #y = input_pd[target_label]
    #X = input_pd.drop(target_label,axis=1)
    y = target_label
    X = input_pd
-    # determine categorical and numerical features
-    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
+    
+    # Determine categorical and numerical features
+    numerical_ix = input_pd.select_dtypes(include=['int64', 'float64']).columns
    numerical_ix
-    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+    categorical_ix = input_pd.select_dtypes(include=['object', 'bool']).columns
    categorical_ix    

    # Determine preprocessing steps ~ var_type
@@ -54,16 +79,20 @@ def run_all_ML(input_pd, target_label, preprocess = True, var_type = 'numerical'
    col_transform = ColumnTransformer(transformers = t
                                       , remainder='passthrough')
    result_pd     = pd.DataFrame()
+    result_bts_pd = pd.DataFrame()
+    #results_btsD = {}
+    results_all = {}
+    
    for name, algorithm in all_estimators(type_filter="classifier"):
        try:
            estmator = algorithm()
            temp_pd = pd.DataFrame()
            temp_cm = pd.DataFrame()

-            # orig
-            pipe = Pipeline([
-                ("model"    , algorithm())
-            ])
+            # # orig
+            # pipe = Pipeline([
+            #     ("model"    , algorithm())
+            # ])
            
            # turn on and off preprocessing
            if preprocess == True:
@@ -76,8 +105,14 @@ def run_all_ML(input_pd, target_label, preprocess = True, var_type = 'numerical'
                    ("model"    , algorithm())
                ])
                
+            # cross val scores
+            y_pred   = cross_val_predict(pipe, X, y, cv = 10, **njobs)
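+# TODO: switch to cross_validate (commented out below): metrics computed on pooled cross_val_predict output are not per-fold averages, so do not treat them as CV scores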
+            # y_pred   = cross_validate(pipe, X, y
+            #                           , cv = 10
+            #                           , scoring = scoring_fn
+            #                           , **njobs)

-            y_pred = cross_val_predict(pipe, X, y, cv = 10, n_jobs=10)
            _mcc     = round(matthews_corrcoef(y_pred, y), 3)
            _bacc    = round(balanced_accuracy_score(y_pred, y), 3)
            _f1      = round(f1_score(y_pred, y), 3)
@@ -88,7 +123,88 @@ def run_all_ML(input_pd, target_label, preprocess = True, var_type = 'numerical'
                                                      columns=['estimator', 'TP', 'TN', 'FP', 'FN',
                                                               'roc_auc', 'matthew', 'bacc', 'f1']),\
                                         ignore_index=True)
+            #=========================
+            # Blind test: BTS results
+            #=========================
+            # Refit the pipeline on the full training data, then score it once on the blind test set
+            pipe.fit(input_pd, target_label)
+            bts_predict = pipe.predict(blind_test_input_df)
+
+            bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
+            print('\nMCC on Blind test:'     , bts_mcc_score)
+            #print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
+            # sklearn metrics take (y_true, y_pred): swapping them transposes the confusion matrix (FP <-> FN) and changes bacc / roc_auc
+            _mccBTS     = round(matthews_corrcoef(blind_test_target, bts_predict), 3)
+            _baccBTS    = round(balanced_accuracy_score(blind_test_target, bts_predict), 3)
+            _f1BTS      = round(f1_score(blind_test_target, bts_predict), 3)
+            _roc_aucBTS = round(roc_auc_score(blind_test_target, bts_predict), 3)
+            _tnBTS, _fpBTS, _fnBTS, _tpBTS = confusion_matrix(blind_test_target, bts_predict).ravel()
+            
+            result_bts_pd = result_bts_pd.append(pd.DataFrame(np.column_stack([name
+                                                                            , _tpBTS, _tnBTS
+                                                                            , _fpBTS, _fnBTS
+                                                                            , _roc_aucBTS
+                                                                            , _mccBTS
+                                                                            , _baccBTS, _f1BTS]),\
+                                                      columns=['estimator', 'TP', 'TN', 'FP', 'FN',
+                                                                'roc_auc', 'matthew', 'bacc', 'f1']),\
+                                          ignore_index=True)
+      
+            # Publish both tables after every estimator so partial results survive a later failure
+            results_all['CrossValResultsDF']   = result_pd
+            results_all['BlindTestResultsDF']  = result_bts_pd
+
        except Exception as e:
-            print("Got an error while running {}".format(name))
+            print("XXXGot an error while running {}".format(name))
            print(e)
-    return(result_pd)
+            
+            
+    #return(result_pd)    
+    return(results_all)
+    
+
+#%% CALL function
+#run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+
+YC_resD2 = run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+
+# The random-oversampling sweep runs once, in the "Random Oversampling" section below (it was duplicated here)
+
+CVResultsDF = YC_resD2['CrossValResultsDF']
+CVResultsDF.sort_values(by=['matthew'], ascending=False, inplace=True)
+BTSResultsDF = YC_resD2['BlindTestResultsDF']
+BTSResultsDF.sort_values(by=['matthew'], ascending=False, inplace=True)
+
+# from sklearn.utils import all_estimators
+# for name, algorithm in all_estimators(type_filter="classifier"):
+#     clf = algorithm()
+#     print('Name:', name, '\nAlgo:', clf)
+
+# Random Oversampling
+YC_resD_ros = run_all_ML(input_pd=X_ros, target_label=y_ros, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+CVResultsDF_ros = YC_resD_ros['CrossValResultsDF']
+CVResultsDF_ros.sort_values(by=['matthew'], ascending=False, inplace=True)
+BTSResultsDF_ros = YC_resD_ros['BlindTestResultsDF']
+BTSResultsDF_ros.sort_values(by=['matthew'], ascending=False, inplace=True)
+
+# Random Undersampling
+YC_resD_rus = run_all_ML(input_pd=X_rus, target_label=y_rus, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+CVResultsDF_rus = YC_resD_rus['CrossValResultsDF']
+CVResultsDF_rus.sort_values(by=['matthew'], ascending=False, inplace=True)
+BTSResultsDF_rus = YC_resD_rus['BlindTestResultsDF']
+BTSResultsDF_rus.sort_values(by=['matthew'], ascending=False, inplace=True)
+
+# Random Oversampling+Undersampling
+YC_resD_rouC = run_all_ML(input_pd=X_rouC, target_label=y_rouC, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+CVResultsDF_rouC = YC_resD_rouC['CrossValResultsDF']
+CVResultsDF_rouC.sort_values(by=['matthew'], ascending=False, inplace=True)
+BTSResultsDF_rouC = YC_resD_rouC['BlindTestResultsDF']
+BTSResultsDF_rouC.sort_values(by=['matthew'], ascending=False, inplace=True)
+
+# SMOTE NC
+YC_resD_smnc = run_all_ML(input_pd=X_smnc, target_label=y_smnc, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+CVResultsDF_smnc = YC_resD_smnc['CrossValResultsDF']
+CVResultsDF_smnc.sort_values(by=['matthew'], ascending=False, inplace=True)
+BTSResultsDF_smnc = YC_resD_smnc['BlindTestResultsDF']
+BTSResultsDF_smnc.sort_values(by=['matthew'], ascending=False, inplace=True)
+