#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 10:33:15 2022

@author: tanu

Stratified K-fold comparison of several classifiers on one feature set.

This section does not define the feature frames (numerical_features_df,
all_features_df, categorical_features_df), the target (target1), or the
pandas / scikit-learn names it uses; they must already exist in the
session that runs it.
"""
#%% Stratified KFold: Multiple_models:
# Pick ONE feature set. Previously input_df/var_type were assigned three
# times in a row, so only the last pair ('categorical') ever took effect.
# That effective default is kept, but the choice is now a single switch.
var_type = 'categorical'  # one of: 'numerical', 'mixed', 'categorical'

feature_sets = {
    'numerical': numerical_features_df,
    'mixed': all_features_df,
    'categorical': categorical_features_df,
}
if var_type not in feature_sets:
    raise ValueError(
        "var_type must be one of %s, got %r" % (sorted(feature_sets), var_type)
    )

input_df = feature_sets[var_type]
targetF = target1

#==============================================================================
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix

# Determine preprocessing steps ~ var_type
# NOTE(review): OneHotEncoder() uses its default handling of unknown
# categories, so a category present only in a test fold may raise at
# predict time; consider handle_unknown='ignore' if that happens.
if var_type == 'numerical':
    t = [('num', MinMaxScaler(), numerical_ix)]
elif var_type == 'categorical':
    t = [('cat', OneHotEncoder(), categorical_ix)]
else:  # 'mixed' (var_type was validated above)
    t = [('cat', OneHotEncoder(), categorical_ix),
         ('num', MinMaxScaler(), numerical_ix)]

###############################################################################
col_transform = ColumnTransformer(transformers=t, remainder='passthrough')
###############################################################################

rs = {'random_state': 42}

log_reg = LogisticRegression(class_weight='balanced')
nb = BernoulliNB()
rf = RandomForestClassifier(**rs)

clfs = [
    ('Logistic Regression', log_reg),
    ('Naive Bayes', nb),
    ('Random Forest', rf),
]

skf = StratifiedKFold(n_splits=10, shuffle=True, **rs)

# One record per (fold, model). The previous version appended every score to
# metric lists shared by ALL classifiers and reported the running mean of
# those pooled lists, so each model's "score" was really an average over all
# models seen so far (hence near-identical numbers). It also re-created the
# result frame inside the fold loop, keeping only the last fold's rows.
fold_records = []

for fold_no, (train_index, test_index) in enumerate(
        skf.split(input_df, targetF), start=1):
    print('\nSKF train index:', train_index
          , '\nSKF test index:', test_index)
    x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
    y_train_fold, y_test_fold = targetF.iloc[train_index], targetF.iloc[test_index]

    for clf_name, clf in clfs:
        model2 = Pipeline(steps=[('preprocess', col_transform),
                                 ('classifier', clf)])
        model2.fit(x_train_fold, y_train_fold)
        y_pred_fold = model2.predict(x_test_fold)

        #----------------
        # Model metrics (this fold, this model only)
        #----------------
        fold_records.append({
            'Model': clf_name,
            'Fold': fold_no,
            'F1_score': f1_score(y_test_fold, y_pred_fold),
            'MCC': matthews_corrcoef(y_test_fold, y_pred_fold),
            'Precision': precision_score(y_test_fold, y_pred_fold),
            'Recall': recall_score(y_test_fold, y_pred_fold),
            'Accuracy': accuracy_score(y_test_fold, y_pred_fold),
            # As before, ROC AUC is computed from hard class predictions,
            # not from predicted probabilities.
            'ROC_curve': roc_auc_score(y_test_fold, y_pred_fold),
        })

# Per-fold, per-model scores (one row per fold and classifier).
fold_scores_df = pd.DataFrame(fold_records)

# Mean of each metric across folds, computed separately for every model.
# Built with groupby instead of DataFrame.append, which pandas 2.0 removed.
clf_scores_df = (fold_scores_df
                 .drop(columns='Fold')
                 .groupby('Model', sort=False)
                 .mean()
                 .reset_index())

#%% Call functions
tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN)
tN_res

t2_res = MultClassPipeline2(X_train, X_test, y_train, y_test, input_df=all_features_df)
t2_res

#CHECK: numbers are awfully close to each other!
# NOTE(review): if MultClassPipeSKF averages over metric lists that are shared
# by all classifiers, near-identical scores would be expected -- verify there.
# Run MultClassPipeSKF on the numerical-only feature frame with 10 splits.
t3_res = MultClassPipeSKF(
    input_df=numerical_features_df,
    y_targetF=target1,
    var_type='numerical',
    skf_splits=10,
)
t3_res

#CHECK: numbers are awfully close to each other!
# Same call on the full feature frame, declared as 'mixed'.
t4_res = MultClassPipeSKF(
    input_df=all_features_df,
    y_targetF=target1,
    var_type='mixed',
    skf_splits=10,
)
t4_res