#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar  5 12:57:32 2022

@author: tanu

Scratch script: build a "scaler + classifier" pipeline and score it with
stratified 10-fold cross-validation.

NOTE(review): the feature frames (numerical_features_df, ...), the targets
(target1, ...), the scikit-learn classes, ``pd`` and ``mean`` are not defined
in this file; presumably they are already in scope from the interactive
session (my_data5.py / my_data6.py / imports.py) -- confirm before running
this as a standalone script.
"""
import os

#%%
# Data, etc for now comes from my_data6.py and/or my_data5.py

#%% Specify dir and import functions
homedir = os.path.expanduser("~")
os.chdir(os.path.join(homedir, "git", "ML_AI_training"))

#%% Try combinations
def precision(y_true, y_pred):
    """Return the precision of *y_pred* against *y_true* for class label 1."""
    return precision_score(y_true, y_pred, pos_label=1)


def recall(y_true, y_pred):
    """Return the recall of *y_pred* against *y_true* for class label 1."""
    return recall_score(y_true, y_pred, pos_label=1)


def f1(y_true, y_pred):
    """Return the F1 score of *y_pred* against *y_true* for class label 1."""
    return f1_score(y_true, y_pred, pos_label=1)


#%% Check df features
# Bare expressions: they only display something when the cell is run
# interactively.
numerical_features_df.shape
categorical_features_df.shape
all_features_df.shape
all_features_df.dtypes

#%% Simple train and test data splits
target = target1
#target = target3

split_kwargs = {'test_size': 0.33, 'random_state': 42}

X_trainN, X_testN, y_trainN, y_testN = train_test_split(
    numerical_features_df, target, **split_kwargs)

X_trainC, X_testC, y_trainC, y_testC = train_test_split(
    categorical_features_df, target, **split_kwargs)

X_train, X_test, y_train, y_test = train_test_split(
    all_features_df, target, **split_kwargs)

#%% Stratified K-fold: Single model
# Pick the feature set by name. The original assigned input_df/var_type three
# times in a row, so a top-to-bottom run ended on 'categorical'; that result
# is kept as the default here.
feature_sets = {
    'numerical': numerical_features_df,
    'mixed': all_features_df,
    'categorical': categorical_features_df,
}
var_type = 'categorical'
input_df = feature_sets[var_type]

y_targetF = target1

#==============================================================================
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix

# Determine preprocessing steps ~ var_type
if var_type == 'numerical':
    t = [('num', MinMaxScaler(), numerical_ix)]
elif var_type == 'categorical':
    t = [('cat', OneHotEncoder(), categorical_ix)]
elif var_type == 'mixed':
    t = [('cat', OneHotEncoder(), categorical_ix),
         ('num', MinMaxScaler(), numerical_ix)]
else:
    # Previously an unknown var_type left `t` undefined and failed later
    # with a NameError; fail here with a clear message instead.
    raise ValueError(f"Unknown var_type: {var_type!r}")

###############################################################################
# NOTE(review): col_transform is built but none of the pipelines below use
# it; they all preprocess with a plain MinMaxScaler() -- confirm this is
# intended for the non-numerical feature sets.
col_transform = ColumnTransformer(transformers=t, remainder='passthrough')
###############################################################################
rs = {'random_state': 42}

# Classifiers keep their module-level names so they remain available to
# anything that refers to them after this cell.
nb = BernoulliNB()
knn = KNeighborsClassifier()
svm = SVC(**rs)
mlp = MLPClassifier(max_iter=500, **rs)
dt = DecisionTreeClassifier(**rs)
et = ExtraTreesClassifier(**rs)
rf = RandomForestClassifier(**rs)

candidate_models = {
    # 'unbalanced' is not an accepted class_weight value; 'balanced' is the
    # supported string option (assumed to be what was meant -- use None for
    # no re-weighting).
    'log_reg': LogisticRegression(class_weight='balanced'),
    'nb': nb,
    'knn': knn,
    'svm': svm,
    'mlp': mlp,
    'dt': dt,
    'et': et,
    'rf': rf,
}

# Select the model to evaluate. The original rebuilt `model1` once per
# classifier (with a `del(model1)` in between, which raises NameError on a
# fresh run), so a top-to-bottom run ended on the random forest.
model_name = 'rf'
model1 = Pipeline(steps=[('preprocess', MinMaxScaler()),
                         (model_name, candidate_models[model_name])])

###############################################################################
#%% run
skf = StratifiedKFold(n_splits=10, shuffle=True, **rs)

# Per-fold scores, one entry per CV split
fscoreL = []
mccL = []
presL = []
recallL = []
accuL = []
roc_aucL = []

for train_index, test_index in skf.split(input_df, y_targetF):
    x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
    y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]

    model1.fit(x_train_fold, y_train_fold)
    y_pred_fold = model1.predict(x_test_fold)

    #----------------
    # Model metrics
    #----------------
    # The per-fold values go straight into the lists; in particular nothing
    # is bound to the name `recall`, which would overwrite the recall()
    # helper defined above.
    fscoreL.append(f1_score(y_test_fold, y_pred_fold))
    mccL.append(matthews_corrcoef(y_test_fold, y_pred_fold))
    presL.append(precision_score(y_test_fold, y_pred_fold))
    recallL.append(recall_score(y_test_fold, y_pred_fold))
    accuL.append(accuracy_score(y_test_fold, y_pred_fold))
    # NOTE(review): AUC is computed from the hard class predictions returned
    # by predict(), not from scores/probabilities.
    roc_aucL.append(roc_auc_score(y_test_fold, y_pred_fold))

# Cross-validated means, computed once after all folds have been scored.
# NOTE(review): the file's indentation was lost, so it is unclear whether the
# original appended one row per fold (running means) or one row in total;
# the single summary row below equals the final row in either reading.
fscoreM = mean(fscoreL)
mccM = mean(mccL)
presM = mean(presL)
recallM = mean(recallL)
accuM = mean(accuL)
roc_aucM = mean(roc_aucL)

# DataFrame.append was removed in pandas 2.0, so build the frame directly.
model_scores_df = pd.DataFrame([{'Model': model1.steps[1][0],
                                 'F1_score': fscoreM,
                                 'MCC': mccM,
                                 'Precision': presM,
                                 'Recall': recallM,
                                 'Accuracy': accuM,
                                 'ROC_curve': roc_aucM}])
print('\nModel metrics:\n', model_scores_df)

# numeric_only skips the string 'Model' column, which otherwise makes
# DataFrame.mean() raise on pandas >= 2.0.
mm = model_scores_df.mean(numeric_only=True)
print('\nModel metrics mean:\n', mm)