From 1bfb35c30c9e54f9090b0dc800c19bcc6ad07759 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 9 Mar 2022 18:35:54 +0000
Subject: [PATCH] trying Stratified Kfold split on running multiple pipelines

---
 MultClassPipe.py                         |  29 +--
 MultClassPipe2.py                        |  28 +--
 __pycache__/MultClassPipe.cpython-37.pyc | Bin 2451 -> 2530 bytes
 imports.py                               |  11 +-
 my_data10.py                             | 255 +++++++++++++++++++----
 my_data9.py                              |  24 ++-
 pnca_results_v1.py                       |  12 ++
 7 files changed, 287 insertions(+), 72 deletions(-)

diff --git a/MultClassPipe.py b/MultClassPipe.py
index 217bbe9..44506aa 100644
--- a/MultClassPipe.py
+++ b/MultClassPipe.py
@@ -20,7 +20,8 @@ from xgboost import XGBClassifier
 from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
+from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score 
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
 #%%
 rs = {'random_state': 42}
 # TODO: add preprocessing step with one hot encoder
@@ -63,7 +64,7 @@ def MultClassPipeline(X_train, X_test, y_train, y_test):
 
     pipelines = []
 
-    scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
+    scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
 
     for clf_name, clf in clfs:
 
@@ -83,24 +84,26 @@ def MultClassPipeline(X_train, X_test, y_train, y_test):
         # Precision
         pres    = precision_score(y_test, y_pred)
         # Recall
-        rcall   = recall_score(y_test, y_pred)
+        recall   = recall_score(y_test, y_pred)
         # Accuracy
         accu    = accuracy_score(y_test, y_pred)
         # ROC_AUC
         roc_auc = roc_auc_score(y_test, y_pred)
-
+        # Matthews correlation coefficient
+        mcc =  matthews_corrcoef(y_test, y_pred)
+        
         pipelines.append(pipeline)
 
         scores_df = scores_df.append({
-                                      'Model'     : clf_name, 
-                                      'F1_Score'  : fscore,
-                                      'Precision' : pres,
-                                      'Recall'    : rcall,
-                                      'Accuracy'  : accu,
-                                      'ROC_AUC'   : roc_auc
-                                      
-                                      }, 
-                                     ignore_index = True)
+                                      'Model'       : clf_name
+                                      , 'F1_Score'  : fscore
+                                      , 'MCC'       : mcc
+                                      , 'Precision' : pres
+                                      , 'Recall'    : recall
+                                      , 'Accuracy'  : accu
+                                      , 'ROC_AUC'   : roc_auc
+                                      }
+                                     , ignore_index = True)
         
     return pipelines, scores_df
 
diff --git a/MultClassPipe2.py b/MultClassPipe2.py
index e4ea381..9fe4619 100644
--- a/MultClassPipe2.py
+++ b/MultClassPipe2.py
@@ -21,7 +21,8 @@ from sklearn.compose import ColumnTransformer
 from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
+from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score 
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
 #%%
 rs = {'random_state': 42}
 # Done: add preprocessing step with one hot encoder
@@ -70,10 +71,9 @@ def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df):
             ('XGBoost', xgb)
             ]
 
-
     pipelines = []
 
-    scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
+    scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
 
     for clf_name, clf in clfs:
 #%%
@@ -101,10 +101,12 @@ def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df):
         
         # F1-Score
         fscore  = f1_score(y_test, y_pred)
+        # Matthews correlation coefficient
+        mcc =  matthews_corrcoef(y_test, y_pred)
         # Precision
         pres    = precision_score(y_test, y_pred)
         # Recall
-        rcall   = recall_score(y_test, y_pred)
+        recall   = recall_score(y_test, y_pred)
         # Accuracy
         accu    = accuracy_score(y_test, y_pred)
         # ROC_AUC
@@ -113,15 +115,15 @@ def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df):
         pipelines.append(pipeline)
 
         scores_df = scores_df.append({
-                                      'Model'     : clf_name, 
-                                      'F1_Score'  : fscore,
-                                      'Precision' : pres,
-                                      'Recall'    : rcall,
-                                      'Accuracy'  : accu,
-                                      'ROC_AUC'   : roc_auc
-                                      
-                                      }, 
-                                     ignore_index = True)
+                                      'Model'       : clf_name 
+                                      , 'F1_Score'  : fscore
+                                      , 'MCC'       : mcc
+                                      , 'Precision' : pres
+                                      , 'Recall'    : recall
+                                      , 'Accuracy'  : accu
+                                      , 'ROC_AUC'   : roc_auc
+                                      }
+                                     , ignore_index = True)
         
     return pipelines, scores_df
 
diff --git a/__pycache__/MultClassPipe.cpython-37.pyc b/__pycache__/MultClassPipe.cpython-37.pyc
index 2156ad951b720a9ddd80148d090878057ef51153..b6c5c1bead891a2eca7e055f56ae121e74a25e40 100644
GIT binary patch
delta 551
zcmY+Azi-n(6vurR$1Wvtoj5<-q;1GA8bkRZ1WbSsK_H<bhN`V(6-DOGP6DwV&W;Gu
zWuY*3sBS|Hg#~qC=>mTO0*QgmLJa%`44s&`+u)tvM|$u3J>7f9j9-R!E1R8{cx-&>
zJbJI~YbH3}U;TU$NEl#!wCp$RhTpWCe#>t8ZM)6;Ih-E?`!qIi;iY7+Vz?n$#iIha
z0Ge{5qcHF$(J%~pz7vz-G|?P{Cd5I{dl3qW%RE?vmdswkc0R`~@xXl+^|(S%=z8oB
zE@@Rh&PkqKp3w{T9e!KGOMMkD?}3><eMhz1(!<v~aF(0p;|yMjv%8W9agF9Z*oXKO
z`<hz685e1a7IxvRLKhB7)OZVkMD@odnxjQpp(R?TCSAn!H!4o!#vYvMPk(Gnbb;!g
z*r&<X<vK7~Am0OJ_DTLIr|x&VY@B{@RV5uh7qJA+2%HsI6F4Vup2Gr3X4nt-y1ik5
z-KkY(ztaO#7rG%(;z$%{Jaz+YRoN5epr#NPIipb`i~mYg5~7|nK}pI-=uje?tfk&j
zR`|7Oz{cuzP+=d`#p*ni0sQAEFayapcUc|wQ>}t4M}O42;F5TbTow=-+58_{yiDe<
X`5~T++#4#dq81BI)D;t$a8CXUc(saY

delta 521
zcmY+APiqrF7{+&IH?wQqY&KbwrqybbrcJjh;-8~>h@v3kp%sKhS=OCRx{}?EGYO<P
zIq9`R8Stb~4<3512f<Gu9t4kb^wgUmegJ3esRQp1-kJB|eV+N{anI?M%X6AMKiAiu
z|0(Y|O>nf={np9aK|5OsR<cge0UCC3H3h*cu3_(u7OY{orTJ$Lm%$a#Gldhy@q|S2
zPDtZ|B*Kl0yg#97k%w6{Ch1gEMkI;T|2M}Yw(O(f@L0e(T;<>4S!YrDI4$drEIQ_Y
z;g5|0!sm9u%$>et&Ry-<+oy0=oz=!RUK*EQYki2%vsxbxAiltV>kn_R%6^@B9|6#q
z`=ZXOtj-#2o;BG5UVe{o3Ac9P%zY`IXsp8AzKYT2uZ$L$=O2s*puvxfFZ@&K(N#pc
z(gayo@D(-`E-GA-@PV+?K`zsVX^xYr-{3yl_A2V#lMpBxjglPui~IoXFO-N%D-88f
z!X!l;Mw9qJBb$8NyoZ*gfMWjHyyY~Z10k@WquctN&VkkJBHf1MvD(=cW*A~A4h1Zz
y(CH2pXcXl*qU8Ev&$<sTUsDFuFxfnbE!hg|W>(<IFu85Yj<)&NQ!S$dE#n_9RE=Z+

diff --git a/imports.py b/imports.py
index 0735ec6..2eaf070 100644
--- a/imports.py
+++ b/imports.py
@@ -21,12 +21,15 @@ from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
 
 from sklearn.compose import ColumnTransformer
 from sklearn.compose import make_column_transformer
-from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
+
+from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score 
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
 from sklearn.metrics import make_scorer
 from sklearn.metrics import classification_report
 
 from sklearn.model_selection import cross_validate
 from sklearn.model_selection import train_test_split
+from sklearn.model_selection import StratifiedKFold
 
 from sklearn.pipeline import Pipeline
 from sklearn.pipeline import make_pipeline
@@ -39,13 +42,15 @@ import matplotlib.pyplot as plt
 import numpy as np
 print(np.__version__)
 print(pd.__version__)
-from statistics import mean, stdev
+from statistics import mean, stdev, median, mode
 #%%
 homedir = os.path.expanduser("~")
 os.chdir(homedir + "/git/ML_AI_training/")
 
 # my function
-from MultClassPipe import MultClassPipeline 
+from MultClassPipe import MultClassPipeline
+from MultClassPipe2 import MultClassPipeline2
+from MultClassPipe3 import MultClassPipeSKF
 
 gene = 'pncA'
 drug = 'pyrazinamide'
diff --git a/my_data10.py b/my_data10.py
index d37ff41..43148f1 100644
--- a/my_data10.py
+++ b/my_data10.py
@@ -6,14 +6,11 @@ Created on Sat Mar  5 12:57:32 2022
 @author: tanu
 """
 #%%
-# data, etc for now  comes from my_data6.py and/or my_data5.py
-#%%
+# Data, etc. for now comes from my_data6.py and/or my_data5.py
+#%% Specify working dir
 homedir = os.path.expanduser("~")
 os.chdir(homedir + "/git/ML_AI_training/")
-
-# my function
-from MultClassPipe2 import MultClassPipeline2 
-#%% try combinations
+#%% Try combinations
 #import sys, os
 #os.system("imports.py")
 def precision(y_true,y_pred):
@@ -23,13 +20,12 @@ def recall(y_true,y_pred):
 def f1(y_true,y_pred):
     return f1_score(y_true, y_pred, pos_label = 1)
 
-#%%
-
+#%% Check df features
 numerical_features_df.shape
 categorical_features_df.shape
 all_features_df.shape
 all_features_df.dtypes
-#%%
+#%% Simple train and test data splits
 target = target1
 #target = target3
 X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df, 
@@ -46,44 +42,231 @@ X_train, X_test, y_train, y_test = train_test_split(all_features_df,
                                                     target, 
                                                     test_size = 0.33, 
                                                     random_state = 42)
-#%%
+#%% Stratified K-fold: Single model
 
+model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
+                               , ('log_reg', LogisticRegression(class_weight = 'balanced')) ])
+model1
+rs = {'random_state': 42}
+log_reg = LogisticRegression(**rs)
+nb = BernoulliNB()
+clfs = [('Logistic Regression', log_reg)
+        ,('Naive Bayes', nb)]
 
+seed_skf = 42
+skf = StratifiedKFold(n_splits = 10
+                      , shuffle = True
+                      , random_state = seed_skf)
 
-#%% with feature selection
+X_array = np.array(numerical_features_df)
+Y = target1
 
-# Determine categorical and numerical features
-input_df = numerical_features_df.copy()
-#input_df = categorical_features_df
-#input_df = all_features_df
+model_scores_df = pd.DataFrame()
+fscoreL      = []
+mccL         = []
+presL        = []
+recallL      = []
+accuL        = []
+roc_aucL     = []
 
-numerical_ix = all_features_df.select_dtypes(include=['int64', 'float64']).columns
+for train_index, test_index in skf.split(X_array, Y):
+    x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
+    y_train_fold, y_test_fold = Y[train_index], Y[test_index]
+
+    model1.fit(x_train_fold, y_train_fold)
+    y_pred_fold  = model1.predict(x_test_fold)
+  
+    #----------------
+    # Model metrics
+    #----------------     
+    # F1-Score
+    fscore = f1_score(y_test_fold, y_pred_fold)
+    fscoreL.append(fscore)
+    fscoreM = mean(fscoreL)
+     
+    # Matthews correlation coefficient
+    mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
+    mccL.append(mcc)
+    mccM = mean(mccL)
+     
+    # Precision
+    pres = precision_score(y_test_fold, y_pred_fold)
+    presL.append(pres)
+    presM = mean(presL)
+    
+    # Recall
+    recall = recall_score(y_test_fold, y_pred_fold)
+    recallL.append(recall)
+    recallM = mean(recallL)            
+   
+    # Accuracy
+    accu = accuracy_score(y_test_fold, y_pred_fold)
+    accuL.append(accu)            
+    accuM = mean(accuL)
+    
+    # ROC_AUC
+    roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
+    roc_aucL.append(roc_auc)            
+    roc_aucM = mean(roc_aucL)    
+         
+model_scores_df = model_scores_df.append({'Model'      : model1.steps[1][0]
+                                          ,'F1_score'  : fscoreM
+                                          , 'MCC'      : mccM
+                                          , 'Precision': presM
+                                          , 'Recall'   : recallM
+                                          , 'Accuracy' : accuM
+                                          , 'ROC_curve': roc_aucM}
+                                         , ignore_index = True)
+print('\nModel metrics:', model_scores_df)                     
+#%% Stratified K-fold: Multiple models
+input_df = numerical_features_df
+#X_array = np.array(input_df)
+Y = target1
+var_type = 'numerical'
+
+input_df = all_features_df
+#X_array = np.array(input_df)
+Y = target1
+var_type = 'mixed'
+
+input_df = categorical_features_df
+#X_array = np.array(input_df)
+Y = target1
+var_type = 'categorical'    
+
+#=================
+numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
 numerical_ix
-categorical_ix = all_features_df.select_dtypes(include=['object', 'bool']).columns
-categorical_ix
 
-# prepare data
-t = [('num', MinMaxScaler(), numerical_ix)
-     , ('cat', OneHotEncoder(), categorical_ix)]
-      
+categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+categorical_ix    
+
+# Determine preprocessing steps based on var_type
+if var_type == 'numerical':
+    t = [('num', MinMaxScaler(), numerical_ix)]
+
+if var_type == 'categorical':
+    t = [('cat', OneHotEncoder(), categorical_ix)]
+
+if var_type == 'mixed':
+    t = [('cat', OneHotEncoder(), categorical_ix)
+         , ('num', MinMaxScaler(), numerical_ix)]
+
+##############################   
 col_transform = ColumnTransformer(transformers = t
-                                  , remainder  = 'passthrough')
+                                   , remainder='passthrough')
 
-# model pipeline
-model = Pipeline(steps=[('prep', col_transform)
-                        , ('classifier', LogisticRegression())])
 
-model.fit(X_train, y_train)
-y_pred = model.predict(X_test)
-y_pred
+rs = {'random_state': 42}
 
-selector_log = RFECV(estimator = model
-                       , cv = 10
-                       , step = 1)
+#log_reg = LogisticRegression(**rs)
+log_reg = LogisticRegression(class_weight = 'balanced')
+nb = BernoulliNB()
+rf = RandomForestClassifier(**rs)
 
-selector_log_x = selector_log.fit_transform(X_train, y_train)
+clfs = [('Logistic Regression', log_reg)
+        ,('Naive Bayes', nb)
+        , ('Random Forest'      , rf) 
+        ]
 
-print(selector_log_x.get_support())
-X_trainN.columns
+#seed_skf = 42
+skf = StratifiedKFold(n_splits = 10
+                      , shuffle = True
+                      #, random_state = seed_skf
+                      , **rs)
+#scores_df  = pd.DataFrame()
+fscoreL      = []
+mccL         = []
+presL        = []
+recallL      = []
+accuL        = []
+roc_aucL     = []
 
-print(selector_logistic_x.ranking_)
\ No newline at end of file
+for train_index, test_index in skf.split(input_df, Y):
+    print('\nSKF train index:', train_index
+          , '\nSKF test index:', test_index)
+    x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
+    y_train_fold, y_test_fold = Y.iloc[train_index], Y.iloc[test_index]
+# for train_index, test_index in skf.split(X_array, Y):
+#      print('\nSKF train index:', train_index
+#            , '\nSKF test index:', test_index)
+    # x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
+    # y_train_fold, y_test_fold = Y[train_index], Y[test_index]
+
+
+    clf_scores_df = pd.DataFrame()
+    for clf_name, clf in clfs:   
+        # model2 = Pipeline(steps=[('preprocess', MinMaxScaler())
+        #                            , ('classifier', clf)])
+        model2 = Pipeline(steps=[('preprocess', col_transform)
+                                    , ('classifier', clf)])
+    
+        model2.fit(x_train_fold, y_train_fold)
+        y_pred_fold  = model2.predict(x_test_fold)
+     
+        #----------------
+        # Model metrics
+        #----------------     
+        # F1-Score
+        fscore = f1_score(y_test_fold, y_pred_fold)
+        fscoreL.append(fscore)
+        fscoreM = mean(fscoreL)
+        
+        # Matthews correlation coefficient
+        mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
+        mccL.append(mcc)
+        mccM = mean(mccL)
+        
+        # Precision
+        pres = precision_score(y_test_fold, y_pred_fold)
+        presL.append(pres)
+        presM = mean(presL)
+        
+        # Recall
+        recall = recall_score(y_test_fold, y_pred_fold)
+        recallL.append(recall)
+        recallM = mean(recallL)            
+       
+        # Accuracy
+        accu = accuracy_score(y_test_fold, y_pred_fold)
+        accuL.append(accu)            
+        accuM = mean(accuL)
+        
+        # ROC_AUC
+        roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
+        roc_aucL.append(roc_auc)            
+        roc_aucM = mean(roc_aucL)    
+            
+        clf_scores_df = clf_scores_df.append({'Model': clf_name 
+                                              ,'F1_score'  : fscoreM
+                                              , 'MCC'      : mccM
+                                              , 'Precision': presM
+                                              , 'Recall'   : recallM
+                                              , 'Accuracy' : accuM
+                                              , 'ROC_curve': roc_aucM}
+                                             , ignore_index = True)
+    #scores_df = scores_df.append(clf_scores_df)
+                        
+    
+#%% Call functions
+
+tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN)
+tN_res
+
+t2_res = MultClassPipeline2(X_train, X_test, y_train, y_test, input_df = all_features_df)
+t2_res
+
+#CHECK: numbers are awfully close to each other!
+
+t3_res = MultClassPipeSKF(input_df = numerical_features_df
+                          , y_targetF = target1
+                          , var_type = 'numerical'
+                          , skf_splits = 10)
+t3_res
+
+#CHECK: numbers are awfully close to each other!
+t4_res = MultClassPipeSKF(input_df = all_features_df
+                          , y_targetF = target1
+                          , var_type = 'mixed'
+                          , skf_splits = 10)
+t4_res    
\ No newline at end of file
diff --git a/my_data9.py b/my_data9.py
index 0b1f4b8..7c6d05f 100644
--- a/my_data9.py
+++ b/my_data9.py
@@ -7,12 +7,6 @@ Created on Sat Mar  5 12:57:32 2022
 """
 #%%
 # data, etc for now  comes from my_data6.py and/or my_data5.py
-#%%
-homedir = os.path.expanduser("~")
-os.chdir(homedir + "/git/ML_AI_training/")
-
-# my function
-from MultClassPipe2 import MultClassPipeline2 
 #%% try combinations
 #import sys, os
 #os.system("imports.py")
@@ -130,5 +124,21 @@ pipeline = Pipeline(steps=[('prep', col_transform)
                                    , ('classifier', LogisticRegression())])
 #%% Added this to the MultClassPipeline
 
+tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN)
+tN_res
+
 t2_res = MultClassPipeline2(X_train, X_test, y_train, y_test, input_df = all_features_df)
-t2_res
\ No newline at end of file
+t2_res
+
+t3_res = MultClassPipeSKF(input_df = numerical_features_df
+                          , y_targetF = target1
+                          , var_type = 'numerical'
+                          , skf_splits = 10)
+t3_res
+
+
+t4_res = MultClassPipeSKF(input_df = all_features_df
+                          , y_targetF = target1
+                          , var_type = 'mixed'
+                          , skf_splits = 10)
+t4_res
\ No newline at end of file
diff --git a/pnca_results_v1.py b/pnca_results_v1.py
index 7d8b097..2c6724d 100644
--- a/pnca_results_v1.py
+++ b/pnca_results_v1.py
@@ -85,3 +85,15 @@ all_features: numerical_features + ['ss_class', 'wt_prop_water', 'mut_prop_water
  9All              XGBoost  0.710526   0.692308  0.729730  0.685714  0.683047)
    
 
+#%%
+                  Model  F1_Score  Precision    Recall  Accuracy   ROC_AUC
+ 0  Logistic Regression  0.757764   0.701149  0.824324  0.721429  0.715192
+ 1          Naive Bayes  0.628571   0.666667  0.594595  0.628571  0.630631
+ 2  K-Nearest Neighbors  0.666667   0.623529  0.716216  0.621429  0.615684
+ 3                  SVM  0.766467   0.688172  0.864865  0.721429  0.712735
+ 4                  MLP  0.726115   0.686747  0.770270  0.692857  0.688165
+ 5        Decision Tree  0.647482   0.692308  0.608108  0.650000  0.652539
+ 6          Extra Trees  0.760736   0.696629  0.837838  0.721429  0.714373
+ 7        Random Forest  0.736196   0.674157  0.810811  0.692857  0.685708
+ 8       Random Forest2  0.736196   0.674157  0.810811  0.692857  0.685708
+ 9              XGBoost  0.710526   0.692308  0.729730  0.685714  0.683047)
\ No newline at end of file