From b6f0308e422d89a24cf71c28b7c9620abcbc38a1 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Sat, 28 May 2022 09:40:24 +0100 Subject: [PATCH] tidying script to run from cmd and via ssh --- UQ_Imbalance.py | 8 +- UQ_pnca_ML.py | 60 +++++-------- classification_params_FS.py | 131 +++++++++++++++++++++++++---- uq_ml_models_FS/scriptfsycm.py | 148 +++++++++++++++++++++++++++++---- 4 files changed, 271 insertions(+), 76 deletions(-) diff --git a/UQ_Imbalance.py b/UQ_Imbalance.py index 1f9d0b4..d471e86 100644 --- a/UQ_Imbalance.py +++ b/UQ_Imbalance.py @@ -117,15 +117,15 @@ print(len(X_enn)) #53 #------------------------------ # Determine categorical and numerical features -numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns +numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns numerical_ix num_featuresL = list(numerical_ix) -numerical_colind = input_df.columns.get_indexer(list(numerical_ix) ) +numerical_colind = X.columns.get_indexer(list(numerical_ix) ) numerical_colind -categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns +categorical_ix = X.select_dtypes(include=['object', 'bool']).columns categorical_ix -categorical_colind = input_df.columns.get_indexer(list(categorical_ix)) +categorical_colind = X.columns.get_indexer(list(categorical_ix)) categorical_colind k_sm = 5 # 5 is deafult diff --git a/UQ_pnca_ML.py b/UQ_pnca_ML.py index 9041cca..d8fad0f 100644 --- a/UQ_pnca_ML.py +++ b/UQ_pnca_ML.py @@ -10,77 +10,57 @@ Created on Sun Mar 6 13:41:54 2022 import os, sys import pandas as pd import numpy as np +print(np.__version__) +print(pd.__version__) import pprint as pp from copy import deepcopy from sklearn import linear_model from sklearn import datasets from collections import Counter -from sklearn.linear_model import LogisticRegression, LinearRegression +from sklearn.linear_model import LogisticRegression, LogisticRegressionCV +from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier + from sklearn.naive_bayes import BernoulliNB from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC -from sklearn.tree import DecisionTreeClassifier -from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier -from sklearn.ensemble import AdaBoostClassifier -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.ensemble import BaggingClassifier +from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier from sklearn.naive_bayes import GaussianNB -from sklearn.gaussian_process import GaussianProcessClassifier -from sklearn.gaussian_process import kernels -from sklearn.gaussian_process.kernels import RBF -from sklearn.gaussian_process.kernels import DotProduct -from sklearn.gaussian_process.kernels import Matern -from sklearn.gaussian_process.kernels import RationalQuadratic -from sklearn.gaussian_process.kernels import WhiteKernel +from sklearn.gaussian_process import GaussianProcessClassifier, kernels +from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel -from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis from sklearn.neural_network import MLPClassifier -from sklearn.linear_model import RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier -from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.svm import SVC from xgboost import XGBClassifier from sklearn.naive_bayes import MultinomialNB -from sklearn.linear_model import SGDClassifier from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder from sklearn.compose import ColumnTransformer from sklearn.compose import make_column_transformer -from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score -from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score -from sklearn.metrics import jaccard_score -from sklearn.metrics import make_scorer -from sklearn.metrics import classification_report +from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score +from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report -from sklearn.metrics import average_precision_score +from sklearn.model_selection import train_test_split, cross_validate, cross_val_score +from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold -from sklearn.model_selection import cross_validate -from sklearn.model_selection import train_test_split -from sklearn.model_selection import StratifiedKFold +from sklearn.pipeline import Pipeline, make_pipeline -from sklearn.pipeline import Pipeline -from sklearn.pipeline import make_pipeline +from sklearn.feature_selection import RFE, RFECV -from sklearn.feature_selection import RFE -from sklearn.feature_selection import RFECV import itertools -#import seaborn as sns +import seaborn as sns import matplotlib.pyplot as plt -import numpy as np -print(np.__version__) -print(pd.__version__) + from statistics import mean, stdev, median, mode from imblearn.over_sampling import RandomOverSampler from imblearn.under_sampling import RandomUnderSampler from imblearn.over_sampling import SMOTE -from imblearn.pipeline import Pipeline from sklearn.datasets import make_classification -from sklearn.model_selection import cross_validate, cross_val_score -from sklearn.model_selection import RepeatedStratifiedKFold -from sklearn.ensemble import AdaBoostClassifier from imblearn.combine import SMOTEENN from imblearn.combine import SMOTETomek @@ -124,8 +104,8 @@ os.chdir(homedir + "/git/ML_AI_training/") from MultClassPipe2 import MultClassPipeline2 from loopity_loop import MultClassPipeSKFLoop #from MultClassPipe3 import MultClassPipeSKFCV -from UQ_MultClassPipe4 import MultClassPipeSKFCV - +#from UQ_MultClassPipe4 import MultClassPipeSKFCV +from UQ_MultModelsCl import MultModelsCl #gene = 'pncA' #drug = 'pyrazinamide' diff --git a/classification_params_FS.py b/classification_params_FS.py index e5b4891..807878b 100644 --- a/classification_params_FS.py +++ b/classification_params_FS.py @@ -6,17 +6,7 @@ # autosklearn --> pipleine --> components --> classification # https://github.com/automl/auto-sklearn/tree/master/autosklearn/pipeline/components/classification -# TOADD: -# LDA -https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/lda.py -# Multinomial_nb -https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/multinomial_nb.py -# passive_aggressive -https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/passive_aggressive.py -# SGD -https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/sgd.py - - +# ADDED 27/05/2022: Extra Tree + LRCV and RCCV ######https://scikit-learn.org/stable/supervised_learning.html ######################################################################## @@ -57,7 +47,7 @@ param_grid_abc = [ #https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/extra_trees.py #https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html #====================== -estimator = ExtraTreesClassifier**rs) +estimator = ExtraTreesClassifier(**rs) # Define pipleline with steps pipe_abc = Pipeline([ @@ -85,6 +75,40 @@ param_grid_abc = [ } ] + +#====================== +# Extra TreeClassifier() + +https://scikit-learn.org/stable/modules/generated/sklearn.tree.ExtraTreeClassifier.html +#====================== +estimator = ExtraTreeClassifier(**rs) + +# Define pipleline with steps +pipe_abc = Pipeline([ + ('pre', MinMaxScaler()) + , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 'matthews_corrcoef')) +# , ('fs', RFECV(estimator, cv = cv, scoring = 'matthews_corrcoef')) +# , ('clf', ExtraTreesClassifier(**rs))]) + , ('clf', estimator) + ]) + +# Define hyperparmeter space to search for +param_grid_abc = [ + { + 'fs__min_features_to_select' : [1,2] +# , 'fs__cv': [cv] + }, + +# 'clf': [ExtraTreeClassifier(**rs)], + 'clf__max_depth': [None], + 'clf__criterion': ['gini', 'entropy'], + 'clf__max_features': [None, 'sqrt', 'log2', 0.5, 1], + 'clf__min_samples_leaf': [1, 5, 10, 15, 20], + 'clf__min_samples_split': [2, 5, 10, 15, 20] + } +] + + #=========================== # DecisionTreeClassifier() https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/decision_tree.py @@ -304,8 +328,8 @@ param_grid_gbc = [ ######################################################################### #=========================== # GaussianNB() -https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/gaussian_nb.py -https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html +#https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/gaussian_nb.py +#https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html #=========================== # Define estimator estimator = GaussianNB() @@ -439,12 +463,58 @@ param_grid_lr = [ 'clf__solver': ['liblinear'] } +] + +######################################################################### +#=========================== +# LogisticRegressionCV () * +# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html +#=========================== +# Define estimator +estimator = LogisticRegressionCV(cv = 10, **rs) + +# Define pipleline with steps +pipe_lr = Pipeline([ + ('pre', MinMaxScaler()) + , ('fs', RFECV(LogisticRegression(**rs), cv = rskf_cv, scoring = 'matthews_corrcoef')) +# , ('fs', RFECV(estimator, cv = cv, scoring = 'matthews_corrcoef')) + , ('clf', estimator)]) + +# Define hyperparmeter space to search for +param_grid_lr = [ + + {'fs__min_features_to_select' : [1,2] +# , 'fs__cv': [rskf_cv] + }, + + { +# 'clf': [LogisticRegressionCV(cv = 10, **rs)], + 'clf__C': np.logspace(0, 4, 10), + 'clf__penalty': ['none', 'l1', 'l2', 'elasticnet'], + 'clf__max_iter': list(range(100,800,100)), + 'clf__solver': ['saga'] + }, + { +# 'clf': [LogisticRegressionCV(cv = 10, **rs)], + 'clf__C': np.logspace(0, 4, 10), + 'clf__penalty': ['l2', 'none'], + 'clf__max_iter': list(range(100,800,100)), + 'clf__solver': ['newton-cg', 'lbfgs', 'sag'] + }, + { +# 'clf': [LogisticRegressionCV(cv = 10, **rs)], + 'clf__C': np.logspace(0, 4, 10), + 'clf__penalty': ['l1', 'l2'], + 'clf__max_iter': list(range(100,800,100)), + 'clf__solver': ['liblinear'] + } + ] ######################################################################### #================== # MLPClassifier() -https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/mlp.py -https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html +#https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/mlp.py +#https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html #================== # Define estimator estimator = MLPClassifier(**rs) @@ -531,6 +601,35 @@ param_grid_rc = [ 'clf__alpha': [0.1, 0.2, 0.5, 0.8, 1.0] } ] + +####################################################################### +#==================== +# RidgeClassifier() * +https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifierCV.html +#==================== + +# Define estimator +estimator = RidgeClassifierCV(cv = 10, **rs) + +# Define pipleline with steps +pipe_rc = Pipeline([ + ('pre', MinMaxScaler()) + , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 'matthews_corrcoef')) +# , ('fs', RFECV(estimator, cv = cv, scoring = 'matthews_corrcoef')) + , ('clf', estimator) + ]) + +param_grid_rc = [ + { + 'fs__min_features_to_select' : [1,2] +# , 'fs__cv': [cv] + }, + + { + #'clf' : [RidgeClassifierCV(cv = 10, **rs)], + 'clf__alpha': [0.1, 0.2, 0.5, 0.8, 1.0] + } +] ####################################################################### #======== # SVC() diff --git a/uq_ml_models_FS/scriptfsycm.py b/uq_ml_models_FS/scriptfsycm.py index 331f191..7934fcb 100644 --- a/uq_ml_models_FS/scriptfsycm.py +++ b/uq_ml_models_FS/scriptfsycm.py @@ -27,17 +27,42 @@ from sklearn.model_selection import train_test_split, cross_validate, cross_val_ # Metric from sklearn.metrics import mean_squared_error, make_scorer, roc_auc_score, f1_score, matthews_corrcoef, accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report +# other vars +rs = {'random_state': 42} +njobs = {'n_jobs': 10} + +scoring_fn = ({'accuracy' : make_scorer(accuracy_score) + , 'fscore' : make_scorer(f1_score) + , 'mcc' : make_scorer(matthews_corrcoef) + , 'precision' : make_scorer(precision_score) + , 'recall' : make_scorer(recall_score) + , 'roc_auc' : make_scorer(roc_auc_score) + , 'jcc' : make_scorer(jaccard_score) + }) + +skf_cv = StratifiedKFold(n_splits = 10 + #, shuffle = False, random_state= None) + , shuffle = True,**rs) + +rskf_cv = RepeatedStratifiedKFold(n_splits = 10 + , n_repeats = 3 + , **rs) + +mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} +jacc_score_fn = {'jcc': make_scorer(jaccard_score)} +#%% YC #def run_all_ML(input_pd, target_label, bts_input, bts_target, var_type): -def run_all_ML(input_pd, target_label, preprocess = True, var_type = 'numerical'): +def run_all_ML(input_pd, target_label, blind_test_input_df, blind_test_target, preprocess = True, var_type = 'numerical'): #y = input_pd[target_label] #X = input_pd.drop(target_label,axis=1) y = target_label X = input_pd - # determine categorical and numerical features - numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns + + # Determine categorical and numerical features + numerical_ix = input_pd.select_dtypes(include=['int64', 'float64']).columns numerical_ix - categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns + categorical_ix = input_pd.select_dtypes(include=['object', 'bool']).columns categorical_ix # Determine preprocessing steps ~ var_type @@ -53,17 +78,21 @@ def run_all_ML(input_pd, target_label, preprocess = True, var_type = 'numerical' col_transform = ColumnTransformer(transformers = t , remainder='passthrough') - result_pd = pd.DataFrame() + result_pd = pd.DataFrame() + result_bts_pd = pd.DataFrame() + #results_btsD = {} + results_all = {} + for name, algorithm in all_estimators(type_filter="classifier"): try: estmator = algorithm() temp_pd = pd.DataFrame() temp_cm = pd.DataFrame() - # orig - pipe = Pipeline([ - ("model" , algorithm()) - ]) + # # orig + # pipe = Pipeline([ + # ("model" , algorithm()) + # ]) # turn on and off preprocessing if preprocess == True: @@ -76,11 +105,17 @@ def run_all_ML(input_pd, target_label, preprocess = True, var_type = 'numerical' ("model" , algorithm()) ]) - - y_pred = cross_val_predict(pipe, X, y, cv = 10, n_jobs=10) - _mcc = round(matthews_corrcoef(y_pred, y), 3) - _bacc = round(balanced_accuracy_score(y_pred, y), 3) - _f1 = round(f1_score(y_pred, y), 3) + # cross val scores + y_pred = cross_val_predict(pipe, X, y, cv = 10, **njobs) +# CHANGE to cross_validate: ONLY THEN CAN YOU TRUST + # y_pred = cross_validate(pipe, X, y + # , cv = 10 + # , scoring = scoring_fn + # , **njobs) + + _mcc = round(matthews_corrcoef(y_pred, y), 3) + _bacc = round(balanced_accuracy_score(y_pred, y), 3) + _f1 = round(f1_score(y_pred, y), 3) _roc_auc = round(roc_auc_score(y_pred, y), 3) _tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() @@ -88,7 +123,88 @@ def run_all_ML(input_pd, target_label, preprocess = True, var_type = 'numerical' columns=['estimator', 'TP', 'TN', 'FP', 'FN', 'roc_auc', 'matthew', 'bacc', 'f1']),\ ignore_index=True) + #========================= + # Blind test: BTS results + #========================= + #Build the final results with all scores for a feature selected model + pipe.fit(input_pd, target_label) + bts_predict = pipe.predict(blind_test_input_df) + + bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2) + print('\nMCC on Blind test:' , bts_mcc_score) + #print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2)) + + _mccBTS = round(matthews_corrcoef(bts_predict, blind_test_target), 3) + _baccBTS = round(balanced_accuracy_score(bts_predict, blind_test_target), 3) + _f1BTS = round(f1_score(bts_predict, blind_test_target), 3) + _roc_aucBTS = round(roc_auc_score(bts_predict, blind_test_target), 3) + _tnBTS, _fpBTS, _fnBTS, _tpBTS = confusion_matrix(bts_predict, blind_test_target).ravel() + + result_bts_pd = result_bts_pd.append(pd.DataFrame(np.column_stack([name + , _tpBTS, _tnBTS + , _fpBTS, _fnBTS + , _roc_aucBTS + , _mccBTS + , _baccBTS, _f1BTS]),\ + columns=['estimator', 'TP', 'TN', 'FP', 'FN', + 'roc_auc', 'matthew', 'bacc', 'f1']),\ + ignore_index=True) + + + results_all['CrossValResultsDF'] = result_pd + results_all['BlindTestResultsDF'] = result_bts_pd + except Exception as e: - print("Got an error while running {}".format(name)) + print("XXXGot an error while running {}".format(name)) print(e) - return(result_pd) + + + #return(result_pd) + return(results_all) + + +#%% CALL function +#run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed') + +YC_resD2 = run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed') + +YC_resD_ros = run_all_ML(input_pd=X_ros, target_label=y_ros, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed') + +CVResultsDF = YC_resD2['CrossValResultsDF'] +CVResultsDF.sort_values(by=['matthew'], ascending=False, inplace=True) +BTSResultsDF = YC_resD2['BlindTestResultsDF'] +BTSResultsDF.sort_values(by=['matthew'], ascending=False, inplace=True) + +# from sklearn.utils import all_estimators +# for name, algorithm in all_estimators(type_filter="classifier"): +# clf = algorithm() +# print('Name:', name, '\nAlgo:', clf) + +# Random Oversampling +YC_resD_ros = run_all_ML(input_pd=X_ros, target_label=y_ros, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed') +CVResultsDF_ros = YC_resD_ros['CrossValResultsDF'] +CVResultsDF_ros.sort_values(by=['matthew'], ascending=False, inplace=True) +BTSResultsDF_ros = YC_resD_ros['BlindTestResultsDF'] +BTSResultsDF_ros.sort_values(by=['matthew'], ascending=False, inplace=True) + +# Random Undersampling +YC_resD_rus = run_all_ML(input_pd=X_rus, target_label=y_rus, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed') +CVResultsDF_rus = YC_resD_rus['CrossValResultsDF'] +CVResultsDF_rus.sort_values(by=['matthew'], ascending=False, inplace=True) +BTSResultsDF_rus = YC_resD_rus['BlindTestResultsDF'] +BTSResultsDF_rus.sort_values(by=['matthew'], ascending=False, inplace=True) + +# Random Oversampling+Undersampling +YC_resD_rouC = run_all_ML(input_pd=X_rouC, target_label=y_rouC, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed') +CVResultsDF_rouC = YC_resD_rouC['CrossValResultsDF'] +CVResultsDF_rouC.sort_values(by=['matthew'], ascending=False, inplace=True) +BTSResultsDF_rouC = YC_resD_rouC['BlindTestResultsDF'] +BTSResultsDF_rouC.sort_values(by=['matthew'], ascending=False, inplace=True) + +# SMOTE NC +YC_resD_smnc = run_all_ML(input_pd=X_smnc, target_label=y_smnc, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed') +CVResultsDF_smnc = YC_resD_smnc['CrossValResultsDF'] +CVResultsDF_smnc.sort_values(by=['matthew'], ascending=False, inplace=True) +BTSResultsDF_smnc = YC_resD_smnc['BlindTestResultsDF'] +BTSResultsDF_smnc.sort_values(by=['matthew'], ascending=False, inplace=True) +