#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 10:33:15 2022

@author: tanu
"""
#%% Stratified KFold: Multiple_models: 
# Feature-set toggles. Each pair rebinds the same two names, so when the cell
# is executed top to bottom only the LAST pair (categorical) remains in
# effect; run a single line on its own to evaluate a different feature set.
input_df, var_type = numerical_features_df, 'numerical'
input_df, var_type = all_features_df, 'mixed'
input_df, var_type = categorical_features_df, 'categorical'

# Target passed to the stratified splitter and to every model fit below.
targetF = target1
#==============================================================================
# Split the columns of input_df by dtype: int64/float64 columns are treated
# as numerical, object/bool columns as categorical.
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns

# Determine preprocessing steps ~ var_type
# handle_unknown='ignore': during cross-validation a test fold may contain a
# category that never occurs in the matching training fold; with the default
# ('error') the encoder would raise at predict time. Unknown categories are
# encoded as an all-zero row instead.
if var_type == 'numerical':
    t = [('num', MinMaxScaler(), numerical_ix)]
elif var_type == 'categorical':
    t = [('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix)]
elif var_type == 'mixed':
    t = [('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix)
         , ('num', MinMaxScaler(), numerical_ix)]
else:
    # Fail loudly rather than leaving `t` undefined (or silently reusing a
    # stale `t` from an earlier interactive run).
    raise ValueError("var_type must be 'numerical', 'categorical' or 'mixed'"
                     ", got: " + repr(var_type))

# Columns not selected by any transformer are passed through unchanged.
col_transform = ColumnTransformer(transformers = t
                                   , remainder = 'passthrough')

###############################################################################
# Single seed, reused by the random forest and by the fold shuffling.
rs = {'random_state': 42}

# Candidate classifiers. Logistic regression is built with balanced class
# weights and without the shared seed; the forest takes the seed from rs.
log_reg = LogisticRegression(class_weight='balanced')
nb = BernoulliNB()
rf = RandomForestClassifier(random_state=rs['random_state'])

# (display name, estimator) pairs, evaluated in this order on every fold.
clfs = [
    ('Logistic Regression', log_reg),
    ('Naive Bayes', nb),
    ('Random Forest', rf),
]

# 10-fold stratified splitter; rows are shuffled with the shared seed.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=rs['random_state'])
# Stratified k-fold evaluation of every classifier in clfs.
#
# One record is stored per (fold, classifier) and the records are averaged
# PER CLASSIFIER after the loop. Scores must not be pooled across classifiers:
# a single shared list per metric turns every reported "mean" into a running
# mean over all models, which makes the models look nearly identical.
fold_records = []

for fold_no, (train_index, test_index) in enumerate(skf.split(input_df, targetF)
                                                    , start = 1):
    print('\nSKF train index:', train_index
          , '\nSKF test index:', test_index)

    # Positional indexing: skf.split yields integer row positions.
    x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
    y_train_fold, y_test_fold = targetF.iloc[train_index], targetF.iloc[test_index]

    for clf_name, clf in clfs:
        # Preprocessing sits inside the pipeline so it is fitted on the
        # training fold only and then applied to the test fold.
        model2 = Pipeline(steps=[('preprocess', col_transform)
                                 , ('classifier', clf)])

        model2.fit(x_train_fold, y_train_fold)
        y_pred_fold = model2.predict(x_test_fold)

        #----------------
        # Model metrics (this fold, this classifier)
        #----------------
        # NOTE(review): the metric calls use their default settings, which
        # presumably means a binary target -- confirm against target1.
        # NOTE: ROC AUC is computed from hard class predictions, not from
        # predicted probabilities or decision scores.
        fold_records.append({'Model'      : clf_name
                             , 'Fold'     : fold_no
                             , 'F1_score' : f1_score(y_test_fold, y_pred_fold)
                             , 'MCC'      : matthews_corrcoef(y_test_fold, y_pred_fold)
                             , 'Precision': precision_score(y_test_fold, y_pred_fold)
                             , 'Recall'   : recall_score(y_test_fold, y_pred_fold)
                             , 'Accuracy' : accuracy_score(y_test_fold, y_pred_fold)
                             , 'ROC_curve': roc_auc_score(y_test_fold, y_pred_fold)})

# Per-fold detail: one row per (fold, classifier).
fold_scores_df = pd.DataFrame(fold_records)

# Final summary: one row per classifier holding the mean of each metric over
# all folds. sort=False keeps the classifiers in the order given in clfs.
clf_scores_df = (fold_scores_df
                 .drop(columns = 'Fold')
                 .groupby('Model', sort = False)
                 .mean()
                 .reset_index())
                        
    
#%% Call functions

# Each result is bound to a name and then echoed as a bare expression so it
# can be inspected when the statements are run interactively.
# NOTE(review): the three MultClass* helpers are defined outside this section;
# what they return is presumably a score table -- confirm at their definitions.

tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN)
tN_res

t2_res = MultClassPipeline2(
    X_train, X_test, y_train, y_test,
    input_df=all_features_df,
)
t2_res

# CHECK: the scores reported for the different models are suspiciously close
# to each other.

t3_res = MultClassPipeSKF(
    input_df=numerical_features_df,
    y_targetF=target1,
    var_type='numerical',
    skf_splits=10,
)
t3_res

# CHECK: the scores reported for the different models are suspiciously close
# to each other.
t4_res = MultClassPipeSKF(
    input_df=all_features_df,
    y_targetF=target1,
    var_type='mixed',
    skf_splits=10,
)
t4_res