import argparse
import os
import re
import sys
import time
from math import sqrt

import joblib
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

# Algorithm
from xgboost.sklearn import XGBClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.utils import all_estimators

# Pre-processing
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict

# Metric
from sklearn.metrics import mean_squared_error, make_scorer, roc_auc_score, f1_score, matthews_corrcoef, accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report


# Column order of the frame returned by run_all_ML.
_RESULT_COLUMNS = ['estimator', 'TP', 'TN', 'FP', 'FN', 'roc_auc', 'matthew', 'bacc', 'f1']


def _build_column_transformer(input_pd, var_type):
    """Return the ColumnTransformer matching ``var_type`` for ``input_pd``."""
    # Columns are split by dtype: int64/float64 are scaled, object/bool are
    # one-hot encoded; anything else falls through via remainder='passthrough'.
    numerical_ix = input_pd.select_dtypes(include=['int64', 'float64']).columns
    categorical_ix = input_pd.select_dtypes(include=['object', 'bool']).columns

    if var_type == 'numerical':
        transformers = [('num', MinMaxScaler(), numerical_ix)]
    elif var_type == 'categorical':
        transformers = [('cat', OneHotEncoder(), categorical_ix)]
    elif var_type == 'mixed':
        transformers = [
            ('num', MinMaxScaler(), numerical_ix),
            ('cat', OneHotEncoder(), categorical_ix),
        ]
    else:
        raise ValueError(
            "var_type must be 'numerical', 'categorical' or 'mixed', "
            "got {!r}".format(var_type)
        )

    return ColumnTransformer(transformers=transformers, remainder='passthrough')


def run_all_ML(input_pd, target_label, preprocess=True, var_type='numerical',
               cv=10, n_jobs=10):
    """Cross-validate every scikit-learn classifier and tabulate its scores.

    Each classifier returned by ``all_estimators(type_filter="classifier")``
    is instantiated with default arguments, evaluated with
    ``cross_val_predict`` and scored on the out-of-fold predictions.
    Estimators that cannot be built or fitted are reported on stdout and
    skipped, so one failing estimator does not abort the sweep.

    Args:
        input_pd: Feature matrix. Must be a pandas DataFrame when
            ``preprocess`` is true, because columns are selected by dtype.
        target_label: The label vector itself (not a column name), aligned
            row-for-row with ``input_pd``. The TP/TN/FP/FN unpacking and the
            default ``f1_score`` averaging only work for binary labels;
            other targets end up in the per-estimator error path.
        preprocess: When true, prepend a ColumnTransformer chosen by
            ``var_type`` to every model pipeline.
        var_type: ``'numerical'`` (MinMax scaling), ``'categorical'``
            (one-hot encoding) or ``'mixed'`` (both). Only consulted when
            ``preprocess`` is true.
        cv: Cross-validation spec forwarded to ``cross_val_predict``.
        n_jobs: Parallel job count forwarded to ``cross_val_predict``.

    Returns:
        A DataFrame with one row per successfully evaluated estimator and the
        columns ``estimator, TP, TN, FP, FN, roc_auc, matthew, bacc, f1``.
        Metric values are rounded to three decimals.

    Raises:
        ValueError: If ``preprocess`` is true and ``var_type`` is not one of
            the supported values.
    """
    y = target_label
    X = input_pd

    # Build the transformer once, up front, so a bad var_type fails loudly
    # instead of being swallowed by the per-estimator error handler below.
    col_transform = _build_column_transformer(input_pd, var_type) if preprocess else None

    rows = []
    for name, algorithm in all_estimators(type_filter="classifier"):
        try:
            steps = [("model", algorithm())]
            if preprocess:
                # cross_val_predict clones the pipeline per fold, so sharing
                # the unfitted transformer across estimators is safe.
                steps.insert(0, ('prep', col_transform))
            pipe = Pipeline(steps)

            y_pred = cross_val_predict(pipe, X, y, cv=cv, n_jobs=n_jobs)

            # scikit-learn metrics take (y_true, y_pred); the order matters
            # for everything except the Matthews coefficient.
            _mcc = round(matthews_corrcoef(y, y_pred), 3)
            _bacc = round(balanced_accuracy_score(y, y_pred), 3)
            _f1 = round(f1_score(y, y_pred), 3)
            # NOTE: AUC is computed from hard class predictions, not scores.
            _roc_auc = round(roc_auc_score(y, y_pred), 3)
            _tn, _fp, _fn, _tp = confusion_matrix(y, y_pred).ravel()
        except Exception as e:
            # Best-effort sweep: many estimators need constructor arguments
            # or reject certain inputs; report and move on.
            print("Got an error while running {}".format(name))
            print(e)
            continue

        rows.append([name, int(_tp), int(_tn), int(_fp), int(_fn),
                     _roc_auc, _mcc, _bacc, _f1])

    return pd.DataFrame(rows, columns=_RESULT_COLUMNS)