#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Benchmark every scikit-learn classifier on a training set (CV) and a blind test set.

Created on Sun May 29 07:43:21 2022

@author: tanu
"""
# stdlib
import argparse
import itertools
import json
import os
import re
import sys
import time
from collections import Counter
from copy import deepcopy
from math import sqrt
from statistics import mean, stdev, median, mode

# third party: core scientific stack
import joblib
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# third party: scikit-learn
from sklearn import datasets, linear_model, preprocessing, svm
from sklearn.base import BaseEstimator
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis,
                                           QuadraticDiscriminantAnalysis)
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.feature_selection import RFE, RFECV
from sklearn.gaussian_process import GaussianProcessClassifier, kernels
from sklearn.gaussian_process.kernels import (RBF, DotProduct, Matern,
                                              RationalQuadratic, WhiteKernel)
from sklearn.impute import KNNImputer as KNN
from sklearn.linear_model import (LogisticRegression, LogisticRegressionCV,
                                  PassiveAggressiveClassifier, RidgeClassifier,
                                  RidgeClassifierCV, SGDClassifier)
from sklearn.metrics import (accuracy_score, average_precision_score,
                             balanced_accuracy_score, classification_report,
                             confusion_matrix, f1_score, jaccard_score,
                             make_scorer, matthews_corrcoef,
                             mean_squared_error, precision_score, recall_score,
                             roc_auc_score, roc_curve)
from sklearn.model_selection import (GridSearchCV, KFold, LeaveOneOut,
                                     RepeatedKFold, RepeatedStratifiedKFold,
                                     StratifiedKFold, cross_val_predict,
                                     cross_val_score, cross_validate,
                                     train_test_split)
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.utils import all_estimators

# third party: imbalanced-learn and xgboost
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import SMOTE, SMOTENC, RandomOverSampler
from imblearn.under_sampling import (EditedNearestNeighbours,
                                     RandomUnderSampler,
                                     RepeatedEditedNearestNeighbours)
from xgboost import XGBClassifier

###############################################################################
# Shared keyword-argument packs and scorers used across the runs.
rs = {'random_state': 42}
njobs = {'n_jobs': 10}

scoring_fn = {'accuracy'  : make_scorer(accuracy_score),
              'fscore'    : make_scorer(f1_score),
              'mcc'       : make_scorer(matthews_corrcoef),
              'precision' : make_scorer(precision_score),
              'recall'    : make_scorer(recall_score),
              'roc_auc'   : make_scorer(roc_auc_score),
              'jcc'       : make_scorer(jaccard_score)}

# Stratified CV splitters (shuffled, fixed seed for reproducibility).
skf_cv = StratifiedKFold(n_splits=10, shuffle=True, **rs)
rskf_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, **rs)

mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}


#%% YC
def _score_row(name, y_true, y_pred):
    """Return a one-row DataFrame of classification scores for one estimator.

    All metrics use the conventional ``(y_true, y_pred)`` argument order so
    that asymmetric scores (f1, roc_auc) and the confusion-matrix cells
    (TP/TN/FP/FN) are labelled correctly.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return pd.DataFrame([{'estimator': name,
                          'matthew' : round(matthews_corrcoef(y_true, y_pred), 3),
                          'roc_auc' : round(roc_auc_score(y_true, y_pred), 3),
                          'bacc'    : round(balanced_accuracy_score(y_true, y_pred), 3),
                          'f1'      : round(f1_score(y_true, y_pred), 3),
                          'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn}])


def run_all_ML(input_pd, target_label, blind_test_input_df, blind_test_target,
               preprocess=True, var_type='numerical'):
    """Fit every sklearn classifier with 10-fold CV and score it on a blind test set.

    Parameters
    ----------
    input_pd : pd.DataFrame
        Training feature matrix.
    target_label : array-like
        Training target vector (despite the name, this is ``y`` itself,
        not a column name).
    blind_test_input_df : pd.DataFrame
        Held-out feature matrix, never seen during CV.
    blind_test_target : array-like
        Held-out target vector.
    preprocess : bool, default True
        When True, prepend a ColumnTransformer (scaling / one-hot encoding,
        chosen by ``var_type``) to each model pipeline.
    var_type : {'numerical', 'categorical', 'mixed'}, default 'numerical'
        Which feature types to preprocess.

    Returns
    -------
    dict
        ``{'CrossValResultsDF': <DataFrame>, 'BlindTestResultsDF': <DataFrame>}``
        with one scored row per classifier that ran without error.
    """
    X = input_pd
    y = target_label

    # Feature groups for the column transformer.
    numerical_ix = input_pd.select_dtypes(include=['int64', 'float64']).columns
    categorical_ix = input_pd.select_dtypes(include=['object', 'bool']).columns

    # Choose preprocessing steps according to the declared variable type.
    if var_type == 'numerical':
        t = [('num', MinMaxScaler(), numerical_ix)]
    elif var_type == 'categorical':
        t = [('cat', OneHotEncoder(), categorical_ix)]
    elif var_type == 'mixed':
        t = [('num', MinMaxScaler(), numerical_ix),
             ('cat', OneHotEncoder(), categorical_ix)]
    else:
        raise ValueError("var_type must be 'numerical', 'categorical' or 'mixed', "
                         "got {!r}".format(var_type))

    col_transform = ColumnTransformer(transformers=t, remainder='passthrough')

    result_pd = pd.DataFrame()
    result_bts_pd = pd.DataFrame()
    results_all = {}

    for name, algorithm in all_estimators(type_filter="classifier"):
        # Some estimators need required arguments or fail on this data;
        # they are skipped (with a message) rather than aborting the sweep.
        try:
            steps = [("model", algorithm())]
            if preprocess:
                steps.insert(0, ('prep', col_transform))
            pipe = Pipeline(steps)

            #-----------------------
            # Cross-validated scores
            #-----------------------
            y_pred = cross_val_predict(pipe, X, y, cv=10, **njobs)
            print('\nMCC on CV:', round(matthews_corrcoef(y, y_pred), 3))
            result_pd = pd.concat([result_pd, _score_row(name, y, y_pred)],
                                  ignore_index=True)

            #=========================
            # Blind test: BTS results
            #=========================
            # Refit on the full training data, then score the held-out set.
            pipe.fit(X, y)
            bts_predict = pipe.predict(blind_test_input_df)
            print('\nMCC on Blind test:',
                  round(matthews_corrcoef(blind_test_target, bts_predict), 2))
            result_bts_pd = pd.concat(
                [result_bts_pd, _score_row(name, blind_test_target, bts_predict)],
                ignore_index=True)
        except Exception as e:
            print("XXXGot an error while running {}".format(name))
            print(e)

    results_all['CrossValResultsDF'] = result_pd
    results_all['BlindTestResultsDF'] = result_bts_pd
    return results_all