Try stratified k-fold splitting when running multiple pipelines

2022-03-09 18:35:54 +00:00 · 2022-03-09 18:35:54 +00:00 · 1bfb35c30c
commit 1bfb35c30c
parent bb8f6f70ba
7 changed files with 287 additions and 72 deletions
--- a/MultClassPipe.py
+++ b/MultClassPipe.py
@ -20,7 +20,8 @@ from xgboost import XGBClassifier
 from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
+from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score 
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
 #%%
 rs = {'random_state': 42}
 # TODO: add preprocessing step with one hot encoder
@ -63,7 +64,7 @@ def MultClassPipeline(X_train, X_test, y_train, y_test):

    pipelines = []

-    scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
+    scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])

    for clf_name, clf in clfs:

@ -83,24 +84,26 @@ def MultClassPipeline(X_train, X_test, y_train, y_test):
        # Precision
        pres    = precision_score(y_test, y_pred)
        # Recall
-        rcall   = recall_score(y_test, y_pred)
+        recall   = recall_score(y_test, y_pred)
        # Accuracy
        accu    = accuracy_score(y_test, y_pred)
        # ROC_AUC
        roc_auc = roc_auc_score(y_test, y_pred)
+        # Matthews correlation coefficient
+        mcc =  matthews_corrcoef(y_test, y_pred)
        
        pipelines.append(pipeline)

        scores_df = scores_df.append({
-                                      'Model'     : clf_name, 
-                                      'F1_Score'  : fscore,
-                                      'Precision' : pres,
-                                      'Recall'    : rcall,
-                                      'Accuracy'  : accu,
-                                      'ROC_AUC'   : roc_auc
-                                      
-                                      }, 
-                                     ignore_index = True)
+                                      'Model'       : clf_name
+                                      , 'F1_Score'  : fscore
+                                      , 'MCC'       : mcc
+                                      , 'Precision' : pres
+                                      , 'Recall'    : recall
+                                      , 'Accuracy'  : accu
+                                      , 'ROC_AUC'   : roc_auc
+                                      }
+                                     , ignore_index = True)
        
    return pipelines, scores_df

--- a/MultClassPipe2.py
+++ b/MultClassPipe2.py
@ -21,7 +21,8 @@ from sklearn.compose import ColumnTransformer
 from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
+from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score 
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
 #%%
 rs = {'random_state': 42}
 # Done: add preprocessing step with one hot encoder
@ -70,10 +71,9 @@ def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df):
            ('XGBoost', xgb)
            ]

-
    pipelines = []

-    scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
+    scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])

    for clf_name, clf in clfs:
 #%%
@ -101,10 +101,12 @@ def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df):
        
        # F1-Score
        fscore  = f1_score(y_test, y_pred)
+        # Matthews correlation coefficient
+        mcc =  matthews_corrcoef(y_test, y_pred)
        # Precision
        pres    = precision_score(y_test, y_pred)
        # Recall
-        rcall   = recall_score(y_test, y_pred)
+        recall   = recall_score(y_test, y_pred)
        # Accuracy
        accu    = accuracy_score(y_test, y_pred)
        # ROC_AUC
@ -113,15 +115,15 @@ def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df):
        pipelines.append(pipeline)

        scores_df = scores_df.append({
-                                      'Model'     : clf_name, 
-                                      'F1_Score'  : fscore,
-                                      'Precision' : pres,
-                                      'Recall'    : rcall,
-                                      'Accuracy'  : accu,
-                                      'ROC_AUC'   : roc_auc
-                                      
-                                      }, 
-                                     ignore_index = True)
+                                      'Model'       : clf_name 
+                                      , 'F1_Score'  : fscore
+                                      , 'MCC'       : mcc
+                                      , 'Precision' : pres
+                                      , 'Recall'    : recall
+                                      , 'Accuracy'  : accu
+                                      , 'ROC_AUC'   : roc_auc
+                                      }
+                                     , ignore_index = True)
        
    return pipelines, scores_df

--- a/pycache/MultClassPipe.cpython-37.pyc
+++ b/pycache/MultClassPipe.cpython-37.pyc
--- a/imports.py
+++ b/imports.py
@ -21,12 +21,15 @@ from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

 from sklearn.compose import ColumnTransformer
 from sklearn.compose import make_column_transformer
-from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
+
+from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score 
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
 from sklearn.metrics import make_scorer
 from sklearn.metrics import classification_report

 from sklearn.model_selection import cross_validate
 from sklearn.model_selection import train_test_split
+from sklearn.model_selection import StratifiedKFold

 from sklearn.pipeline import Pipeline
 from sklearn.pipeline import make_pipeline
@ -39,13 +42,15 @@ import matplotlib.pyplot as plt
 import numpy as np
 print(np.__version__)
 print(pd.__version__)
-from statistics import mean, stdev
+from statistics import mean, stdev, median, mode
 #%%
 homedir = os.path.expanduser("~")
 os.chdir(homedir + "/git/ML_AI_training/")

 # my function
 from MultClassPipe import MultClassPipeline
+from MultClassPipe2 import MultClassPipeline2
+from MultClassPipe3 import MultClassPipeSKF

 gene = 'pncA'
 drug = 'pyrazinamide'
--- a/my_data10.py
+++ b/my_data10.py
@ -6,14 +6,11 @@ Created on Sat Mar  5 12:57:32 2022
@author: tanu
 """
 #%%
-# data, etc for now  comes from my_data6.py and/or my_data5.py
-#%%
+# Data etc. currently come from my_data6.py and/or my_data5.py
+#%% Specify dir and import functions
 homedir = os.path.expanduser("~")
 os.chdir(homedir + "/git/ML_AI_training/")
-
-# my function
-from MultClassPipe2 import MultClassPipeline2 
-#%% try combinations
+#%% Try combinations
 #import sys, os
 #os.system("imports.py")
 def precision(y_true,y_pred):
@ -23,13 +20,12 @@ def recall(y_true,y_pred):
 def f1(y_true,y_pred):
    return f1_score(y_true, y_pred, pos_label = 1)

-#%%
-
+#%% Check df features
 numerical_features_df.shape
 categorical_features_df.shape
 all_features_df.shape
 all_features_df.dtypes
-#%%
+#%% Simple train and test data splits
 target = target1
 #target = target3
 X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df, 
@ -46,44 +42,231 @@ X_train, X_test, y_train, y_test = train_test_split(all_features_df,
                                                    target, 
                                                    test_size = 0.33, 
                                                    random_state = 42)
-#%%
+#%% Stratified K-fold: Single model

+model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
+                               , ('log_reg', LogisticRegression(class_weight = 'balanced')) ])
+model1
+rs = {'random_state': 42}
+log_reg = LogisticRegression(**rs)
+nb = BernoulliNB()
+clfs = [('Logistic Regression', log_reg)
+        ,('Naive Bayes', nb)]

+seed_skf = 42
+skf = StratifiedKFold(n_splits = 10
+                      , shuffle = True
+                      , random_state = seed_skf)

-#%% with feature selection
+X_array = np.array(numerical_features_df)
+Y = target1

-# Determine categorical and numerical features
-input_df = numerical_features_df.copy()
-#input_df = categorical_features_df
-#input_df = all_features_df
+model_scores_df = pd.DataFrame()
+fscoreL      = []
+mccL         = []
+presL        = []
+recallL      = []
+accuL        = []
+roc_aucL     = []

-numerical_ix = all_features_df.select_dtypes(include=['int64', 'float64']).columns
+for train_index, test_index in skf.split(X_array, Y):
+    x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
+    y_train_fold, y_test_fold = Y[train_index], Y[test_index]
+
+    model1.fit(x_train_fold, y_train_fold)
+    y_pred_fold  = model1.predict(x_test_fold)
+  
+    #----------------
+    # Model metrics
+    #----------------     
+    # F1-Score
+    fscore = f1_score(y_test_fold, y_pred_fold)
+    fscoreL.append(fscore)
+    fscoreM = mean(fscoreL)
+     
+    # Matthews correlation coefficient
+    mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
+    mccL.append(mcc)
+    mccM = mean(mccL)
+     
+    # Precision
+    pres = precision_score(y_test_fold, y_pred_fold)
+    presL.append(pres)
+    presM = mean(presL)
+    
+    # Recall
+    recall = recall_score(y_test_fold, y_pred_fold)
+    recallL.append(recall)
+    recallM = mean(recallL)            
+   
+    # Accuracy
+    accu = accuracy_score(y_test_fold, y_pred_fold)
+    accuL.append(accu)            
+    accuM = mean(accuL)
+    
+    # ROC_AUC
+    roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
+    roc_aucL.append(roc_auc)            
+    roc_aucM = mean(roc_aucL)    
+         
+model_scores_df = model_scores_df.append({'Model'      : model1.steps[1][0]
+                                          ,'F1_score'  : fscoreM
+                                          , 'MCC'      : mccM
+                                          , 'Precision': presM
+                                          , 'Recall'   : recallM
+                                          , 'Accuracy' : accuM
+                                          , 'ROC_curve': roc_aucM}
+                                         , ignore_index = True)
+print('\nModel metrics:', model_scores_df)                     
+#%% Stratified K-fold: Multiple models
+input_df = numerical_features_df
+#X_array = np.array(input_df)
+Y = target1
+var_type = 'numerical'
+
+input_df = all_features_df
+#X_array = np.array(input_df)
+Y = target1
+var_type = 'mixed'
+
+input_df = categorical_features_df
+#X_array = np.array(input_df)
+Y = target1
+var_type = 'categorical'    
+
+#=================
+numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
 numerical_ix
-categorical_ix = all_features_df.select_dtypes(include=['object', 'bool']).columns
+
+categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
 categorical_ix    

-# prepare data
-t = [('num', MinMaxScaler(), numerical_ix)
-     , ('cat', OneHotEncoder(), categorical_ix)]
+# Determine preprocessing steps based on var_type
+if var_type == 'numerical':
+    t = [('num', MinMaxScaler(), numerical_ix)]

+if var_type == 'categorical':
+    t = [('cat', OneHotEncoder(), categorical_ix)]
+
+if var_type == 'mixed':
+    t = [('cat', OneHotEncoder(), categorical_ix)
+         , ('num', MinMaxScaler(), numerical_ix)]
+
+##############################   
 col_transform = ColumnTransformer(transformers = t
                                   , remainder='passthrough')

-# model pipeline
-model = Pipeline(steps=[('prep', col_transform)
-                        , ('classifier', LogisticRegression())])

-model.fit(X_train, y_train)
-y_pred = model.predict(X_test)
-y_pred
+rs = {'random_state': 42}

-selector_log = RFECV(estimator = model
-                       , cv = 10
-                       , step = 1)
+#log_reg = LogisticRegression(**rs)
+log_reg = LogisticRegression(class_weight = 'balanced')
+nb = BernoulliNB()
+rf = RandomForestClassifier(**rs)

-selector_log_x = selector_log.fit_transform(X_train, y_train)
+clfs = [('Logistic Regression', log_reg)
+        ,('Naive Bayes', nb)
+        , ('Random Forest'      , rf) 
+        ]

-print(selector_log_x.get_support())
-X_trainN.columns
+#seed_skf = 42
+skf = StratifiedKFold(n_splits = 10
+                      , shuffle = True
+                      #, random_state = seed_skf
+                      , **rs)
+#scores_df  = pd.DataFrame()
+fscoreL      = []
+mccL         = []
+presL        = []
+recallL      = []
+accuL        = []
+roc_aucL     = []

-print(selector_logistic_x.ranking_)
+for train_index, test_index in skf.split(input_df, Y):
+    print('\nSKF train index:', train_index
+          , '\nSKF test index:', test_index)
+    x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
+    y_train_fold, y_test_fold = Y.iloc[train_index], Y.iloc[test_index]
+# for train_index, test_index in skf.split(X_array, Y):
+#      print('\nSKF train index:', train_index
+#            , '\nSKF test index:', test_index)
+    # x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
+    # y_train_fold, y_test_fold = Y[train_index], Y[test_index]
+
+
+    clf_scores_df = pd.DataFrame()
+    for clf_name, clf in clfs:   
+        # model2 = Pipeline(steps=[('preprocess', MinMaxScaler())
+        #                            , ('classifier', clf)])
+        model2 = Pipeline(steps=[('preprocess', col_transform)
+                                    , ('classifier', clf)])
+    
+        model2.fit(x_train_fold, y_train_fold)
+        y_pred_fold  = model2.predict(x_test_fold)
+     
+        #----------------
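+        # Model metrics -- NOTE(review): fscoreL, mccL, etc. are created once outside both loops, so each *M value is a running mean over every classifier and fold fitted so far, not a per-classifier mean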
+        #----------------     
+        # F1-Score
+        fscore = f1_score(y_test_fold, y_pred_fold)
+        fscoreL.append(fscore)
+        fscoreM = mean(fscoreL)
+        
+        # Matthews correlation coefficient
+        mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
+        mccL.append(mcc)
+        mccM = mean(mccL)
+        
+        # Precision
+        pres = precision_score(y_test_fold, y_pred_fold)
+        presL.append(pres)
+        presM = mean(presL)
+        
+        # Recall
+        recall = recall_score(y_test_fold, y_pred_fold)
+        recallL.append(recall)
+        recallM = mean(recallL)            
+       
+        # Accuracy
+        accu = accuracy_score(y_test_fold, y_pred_fold)
+        accuL.append(accu)            
+        accuM = mean(accuL)
+        
+        # ROC_AUC
+        roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
+        roc_aucL.append(roc_auc)            
+        roc_aucM = mean(roc_aucL)    
+            
+        clf_scores_df = clf_scores_df.append({'Model': clf_name 
+                                              ,'F1_score'  : fscoreM
+                                              , 'MCC'      : mccM
+                                              , 'Precision': presM
+                                              , 'Recall'   : recallM
+                                              , 'Accuracy' : accuM
+                                              , 'ROC_curve': roc_aucM}
+                                             , ignore_index = True)
+    #scores_df = scores_df.append(clf_scores_df)
+                        
+    
+#%% Call functions
+
+tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN)
+tN_res
+
+t2_res = MultClassPipeline2(X_train, X_test, y_train, y_test, input_df = all_features_df)
+t2_res
+
+#CHECK: numbers are awfully close to each other!
+
+t3_res = MultClassPipeSKF(input_df = numerical_features_df
+                          , y_targetF = target1
+                          , var_type = 'numerical'
+                          , skf_splits = 10)
+t3_res
+
+#CHECK: numbers are awfully close to each other!
+t4_res = MultClassPipeSKF(input_df = all_features_df
+                          , y_targetF = target1
+                          , var_type = 'mixed'
+                          , skf_splits = 10)
+t4_res    
--- a/my_data9.py
+++ b/my_data9.py
@ -7,12 +7,6 @@ Created on Sat Mar  5 12:57:32 2022
 """
 #%%
 # data, etc for now  comes from my_data6.py and/or my_data5.py
-#%%
-homedir = os.path.expanduser("~")
-os.chdir(homedir + "/git/ML_AI_training/")
-
-# my function
-from MultClassPipe2 import MultClassPipeline2 
 #%% try combinations
 #import sys, os
 #os.system("imports.py")
@ -130,5 +124,21 @@ pipeline = Pipeline(steps=[('prep', col_transform)
                                   , ('classifier', LogisticRegression())])
 #%% Added this to the MultClassPipeline

+tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN)
+tN_res
+
 t2_res = MultClassPipeline2(X_train, X_test, y_train, y_test, input_df = all_features_df)
 t2_res
+
+t3_res = MultClassPipeSKF(input_df = numerical_features_df
+                          , y_targetF = target1
+                          , var_type = 'numerical'
+                          , skf_splits = 10)
+t3_res
+
+
+t4_res = MultClassPipeSKF(input_df = all_features_df
+                          , y_targetF = target1
+                          , var_type = 'mixed'
+                          , skf_splits = 10)
+t4_res
--- a/pnca_results_v1.py
+++ b/pnca_results_v1.py
@ -85,3 +85,15 @@ all_features: numerical_features + ['ss_class', 'wt_prop_water', 'mut_prop_water
 9All              XGBoost  0.710526   0.692308  0.729730  0.685714  0.683047)
   

+#%%
+                  Model  F1_Score  Precision    Recall  Accuracy   ROC_AUC
+ 0  Logistic Regression  0.757764   0.701149  0.824324  0.721429  0.715192
+ 1          Naive Bayes  0.628571   0.666667  0.594595  0.628571  0.630631
+ 2  K-Nearest Neighbors  0.666667   0.623529  0.716216  0.621429  0.615684
+ 3                  SVM  0.766467   0.688172  0.864865  0.721429  0.712735
+ 4                  MLP  0.726115   0.686747  0.770270  0.692857  0.688165
+ 5        Decision Tree  0.647482   0.692308  0.608108  0.650000  0.652539
+ 6          Extra Trees  0.760736   0.696629  0.837838  0.721429  0.714373
+ 7        Random Forest  0.736196   0.674157  0.810811  0.692857  0.685708
+ 8       Random Forest2  0.736196   0.674157  0.810811  0.692857  0.685708
+ 9              XGBoost  0.710526   0.692308  0.729730  0.685714  0.683047)