added loopity_loop to run multiple models with stratified k-fold, got stuck in infinite loops and nested dicts

This commit is contained in:
Tanushree Tunstall 2022-03-14 10:36:19 +00:00
parent 69d0c1b557
commit 7aead2d4f4
18 changed files with 287 additions and 62 deletions

SKF_SSF.txt Normal file (+48)

@@ -0,0 +1,48 @@
# Stratified K-fold vs Stratified ShuffleSplit
https://stackoverflow.com/questions/45969390/difference-between-stratifiedkfold-and-stratifiedshufflesplit-in-sklearn
In StratifiedShuffleSplit, the data is reshuffled before every split, so the test sets may overlap between splits.
In StratifiedKFold (SKF), the test sets never overlap.
So the difference is: StratifiedKFold shuffles at most once and then partitions the data, so its n_splits test sets are disjoint and together cover every sample exactly once, whereas StratifiedShuffleSplit reshuffles and resplits n_splits times, so its test sets can overlap and some samples may never be tested.
Note: both methods use "stratified" folds (hence "stratified" in both names): each part preserves the same percentage of samples of each class (label) as the original data. See the sklearn cross-validation documentation for details.
''' python code '''
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold

splits = 5
tx = range(10)           # 10 toy samples
ty = [0] * 5 + [1] * 5   # two balanced classes

kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
shufflesplit = StratifiedShuffleSplit(n_splits=splits, random_state=42, test_size=2)

print("KFold")
for train_index, test_index in kfold.split(tx, ty):
    print("TRAIN:", train_index, "TEST:", test_index)

print("Shuffle Split")
for train_index, test_index in shufflesplit.split(tx, ty):
    print("TRAIN:", train_index, "TEST:", test_index)
'''
Output:
KFold
TRAIN: [0 2 3 4 5 6 7 9] TEST: [1 8]
TRAIN: [0 1 2 3 5 7 8 9] TEST: [4 6]
TRAIN: [0 1 3 4 5 6 8 9] TEST: [2 7]
TRAIN: [1 2 3 4 6 7 8 9] TEST: [0 5]
TRAIN: [0 1 2 4 5 6 7 8] TEST: [3 9]
Shuffle Split
TRAIN: [8 4 1 0 6 5 7 2] TEST: [3 9]
TRAIN: [7 0 3 9 4 5 1 6] TEST: [8 2]
TRAIN: [1 2 5 6 4 8 9 0] TEST: [3 7]
TRAIN: [4 6 7 8 3 5 1 2] TEST: [9 0]
TRAIN: [7 2 6 5 4 3 0 9] TEST: [1 8]
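
The non-overlap claim is easy to verify: count how often each index lands in a test set. A quick sketch reusing the kfold and shufflesplit objects defined above:
''' python code '''
from collections import Counter

skf_counts = Counter(i for _, test in kfold.split(tx, ty) for i in test)
sss_counts = Counter(i for _, test in shufflesplit.split(tx, ty) for i in test)

print(skf_counts)  # every index appears exactly once: the folds partition the data
print(sss_counts)  # uneven counts: indices 3, 8, 9 repeat; 4, 5, 6 are never tested
'''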

(3 binary files changed, not shown)

comp_results Normal file (+29)

@@ -0,0 +1,29 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 10:59:36 2022
@author: tanu
"""
# numerical
#log_reg (rs)
F1_score 0.713380
MCC 0.376546
Precision 0.687628
Recall 0.747231
Accuracy 0.687293
ROC_curve 0.683199
#log_reg (balanced)
F1_score 0.715106
MCC 0.390225
Precision 0.702629
Recall 0.733445
Accuracy 0.694309
ROC_curve 0.691555
#log_reg (unbalanced)
F1_score 0.713380
MCC 0.376546
Precision 0.687628
Recall 0.747231
Accuracy 0.687293
ROC_curve 0.683199
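
# The (balanced) vs (unbalanced) rows above presumably differ only in the
# class_weight setting; a minimal sketch of how such a comparison is usually
# set up in sklearn (illustrative names, not the repo's actual pipeline):
from sklearn.linear_model import LogisticRegression

log_reg_unbal = LogisticRegression(random_state=42)  # default: all classes weighted equally
log_reg_bal = LogisticRegression(random_state=42, class_weight='balanced')  # weights ~ 1 / class frequency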

(modified file, name not shown)

@@ -50,10 +50,10 @@ os.chdir(homedir + "/git/ML_AI_training/")
# my function
from MultClassPipe import MultClassPipeline
from MultClassPipe2 import MultClassPipeline2
- from MultClassPipe3 import MultClassPipeSKF
+ from loopity_loop import MultClassPipeSKF
- gene = 'pncA'
- drug = 'pyrazinamide'
+ gene = 'rpoB'
+ drug = 'rifampicin'
#==============
# directories
@@ -82,12 +82,19 @@ mycols = my_df.columns
my_df['active_aa_pos'].dtype
my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
if gene.lower() in geneL_na_ppi2:
x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
#D1148 get rid of
na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
my_df = my_df.drop(index=na_index)
#%%============================================================================
# GET Y
# Target1: mutation_info_labels
dm_om_map = {'DM': 1, 'OM': 0}
target1 = my_df['mutation_info_labels'].map(dm_om_map)
target1.value_counts()
# Target2: drug
drug_labels = drug + '_labels'

loopity_detangle.py Normal file (+82)

@@ -0,0 +1,82 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 18:06:34 2022
@author: tanu
"""
#%%
import random
import pprint as pp
# log_reg and knn are the estimator objects defined in the calling script
models = [
    ('Logistic Regression' , log_reg)
    , ('K-Nearest Neighbors', knn)
]
classification_metrics = {
    'F1_score': []
    ,'MCC': []
    ,'Precision': []
    ,'Recall': []
    ,'Accuracy': []
    ,'ROC_curve': []
}
folds = [1, 2]
fold_no = 1
fold_dict = {}
for model_name, model in models:
    fold_dict.update({model_name: {}})

for f in folds:
    fold = "fold_" + str(fold_no)
    for model_name, model in models:
        print("start of model", model_name, "fold: ", fold)
        fold_dict[model_name].update({fold: {}})
        # NB: update() inserts references to the *shared* metric lists; use
        # copy.deepcopy(classification_metrics) to give each fold its own copy
        fold_dict[model_name][fold].update(classification_metrics)
        print("end of model", model_name, "fold: ", fold)
        fold_dict[model_name][fold].update({'F1_score': random.randrange(1, 10)})
    fold_no += 1
pp.pprint(fold_dict)
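#%% A flatter alternative (sketch): one record per (model, fold) avoids the
# nested dicts entirely and lets pandas do the per-model aggregation.
# Same toy setup as above (random F1 scores); not the real pipeline.
import pandas as pd

records = []
for fold_no in folds:
    for model_name, model in models:
        records.append({'Model': model_name
                        , 'Fold': fold_no
                        , 'F1_score': random.randrange(1, 10)})
scores_long = pd.DataFrame(records)
print(scores_long.groupby('Model')['F1_score'].mean())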
#%%
from statistics import mean

for model_name, model in models:
    print("Calculating mean F1_score for: ", model_name)
    folds_f1 = []   # reset per model so scores don't leak across models
    # TODO: wrap this in a loop over classification_metrics
    for key in fold_dict[model_name]:
        # index by model_name, not the hard-coded 'Logistic Regression'
        folds_f1.append(fold_dict[model_name][key]['F1_score'])
        print('key:', key, 'F1scores:', folds_f1)
    print('mean F1:', mean(folds_f1))
#%%
scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
# manually
model_name = 'Logistic Regression'
model_metric = 'F1_score'
log_reg_f1 = []
for key in fold_dict[model_name]:
    log_reg_f1.append(fold_dict[model_name][key][model_metric])
log_reg_f1M = mean(log_reg_f1)
print('key:', key, model_metric, ':', log_reg_f1)
print(log_reg_f1M)
log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
log_reg_f1df
#%%
model_metric = 'MCC'
log_reg_mcc = []
for key in fold_dict[model_name]:
    log_reg_mcc.append(fold_dict[model_name][key][model_metric])
log_reg_mccM = mean(log_reg_mcc)
print('key:', key, model_metric, ':', log_reg_mcc)
print(log_reg_mccM)
log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
log_reg_mccdf
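#%% The two metric cells above differ only in the metric name; they collapse
# into one loop over classification_metrics that fills the scores_df declared
# earlier. Sketch: assumes each metric holds one number per fold (in the toy
# run above that is only true of F1_score).
from statistics import mean
import pandas as pd

summary_rows = []
for model_name in fold_dict:
    row = {'Model': model_name}
    for metric in classification_metrics:
        row[metric] = mean(fold_dict[model_name][fold][metric]
                           for fold in fold_dict[model_name])
    summary_rows.append(row)
scores_df = pd.DataFrame(summary_rows)
print(scores_df)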

(modified file, name not shown)

@@ -97,13 +97,13 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
,'Precision': []
,'Recall': []
,'Accuracy': []
,'ROC_curve': []
#,'ROC_AUC': []
}
models = [
('Logistic Regression' , log_reg)
- #, ('Naive Bayes' , nb)
+ , ('Naive Bayes' , nb)
, ('K-Nearest Neighbors', knn)
- # , ('SVM' , svm)
+ , ('SVM' , svm)
# , ('MLP' , mlp)
# , ('Decision Tree' , dt)
# , ('Extra Trees' , et)
@@ -132,10 +132,8 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
#print("Fold: ", fold_no, len(train_index), len(test_index))
# for keys in skf_dict:
for model_name, model in models:
print("start of model", model_name, " loop", fold_no)
print("\nStart of model", model_name, "\nLoop no.", fold_no)
#skf_dict.update({model_name: classification_metrics })
model_pipeline = Pipeline(steps=[('prep' , col_transform)
, ('classifier' , model)])
@@ -145,28 +143,39 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
#----------------
# Model metrics
#----------------
- score=f1_score(y_test_fold, y_pred_fold)
- mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
+ fscore = f1_score(y_test_fold, y_pred_fold)
+ mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
- #pres = precision_score(y_test_fold, y_pred_fold)
- #recall = recall_score(y_test_fold, y_pred_fold)
+ pres = precision_score(y_test_fold, y_pred_fold, zero_division=0)
+ recall = recall_score(y_test_fold, y_pred_fold, zero_division=0)
accu = accuracy_score(y_test_fold, y_pred_fold)
#roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
fold=("fold_"+str(fold_no))
fold_dict[model_name].update({fold: {}})
- pp.pprint(fold_dict)
- print("end of model", model_name, " loop", fold_no)
+ #pp.pprint(fold_dict)
+ print("\nEnd of model", model_name, "\nLoop no.", fold_no)
fold_dict[model_name][fold].update(classification_metrics)
- #fold_dict[model_name][fold]['F1_score'].append(score)
- fold_dict[model_name][fold].update({'F1_score': score})
- fold_dict[model_name][fold].update({'MCC': mcc})
+ fold_dict[model_name][fold].update({'F1_score' : fscore})
+ fold_dict[model_name][fold].update({'MCC' : mcc})
+ fold_dict[model_name][fold].update({'Precision' : pres})
+ fold_dict[model_name][fold].update({'Recall' : recall})
+ fold_dict[model_name][fold].update({'Accuracy' : accu})
+ #fold_dict[model_name][fold].update({'ROC_AUC' : roc_auc})
fold_no +=1
#pp.pprint(skf_dict)
return(fold_dict)
- t3_res = MultClassPipeSKF(input_df = numerical_features_df
-                           , y_targetF = target1
-                           , var_type = 'numerical'
-                           , skf_splits = 10)
- #pp.pprint(t3_res)
- #print(t3_res)
+ #%% Call function
+ # t3_res = MultClassPipeSKF(input_df = numerical_features_df
+ #                           , y_targetF = target1
+ #                           , var_type = 'numerical'
+ #                           , skf_splits = 10)
+ # pp.pprint(t3_res)
+ # #print(t3_res)
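#%% Note on zero_division=0 (added above): when a fold's predictions contain
# no positives, precision is undefined; sklearn's default ('warn') returns 0.0
# but emits UndefinedMetricWarning on every such fold. zero_division=0 keeps
# the 0.0 score and silences the warning. A minimal illustration:
from sklearn.metrics import precision_score

y_true = [1, 0, 1, 0]
y_pred = [0, 0, 0, 0]  # classifier never predicts the positive class
print(precision_score(y_true, y_pred, zero_division=0))  # 0.0, no warning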

loopity_loop_CALL Normal file (+90)

@@ -0,0 +1,90 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 11 11:15:50 2022
@author: tanu
"""
#%%
del(t3_res)
t3_res = MultClassPipeSKF(input_df = numerical_features_df
, y_targetF = target1
, var_type = 'numerical'
, skf_splits = 10)
pp.pprint(t3_res)
#print(t3_res)
#%% Manually: mean for each model, each metric
# pick ONE model at a time (each assignment below overwrites the previous),
# then re-run the metric cells that follow
model_name = 'Logistic Regression'
#model_name = 'Naive Bayes'
#model_name = 'K-Nearest Neighbors'
#model_name = 'SVM'
#%%
model_metric = 'F1_score'
log_reg_f1 = []
for key in t3_res[model_name]:
    log_reg_f1.append(t3_res[model_name][key][model_metric])
log_reg_f1M = mean(log_reg_f1)
print('key:', key, model_metric, ':', log_reg_f1)
print(log_reg_f1M)
log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
log_reg_f1df
#%%
model_metric = 'MCC'
log_reg_mcc = []
for key in t3_res[model_name]:
    log_reg_mcc.append(t3_res[model_name][key][model_metric])
log_reg_mccM = mean(log_reg_mcc)
print('key:', key, model_metric, ':', log_reg_mcc)
print(log_reg_mccM)
log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
log_reg_mccdf
#%%
model_metric = 'Precision'
log_reg_pres = []
for key in t3_res[model_name]:
    log_reg_pres.append(t3_res[model_name][key][model_metric])
log_reg_presM = mean(log_reg_pres)
print('key:', key, model_metric, ':', log_reg_pres)
print(log_reg_presM)
log_reg_presdf = pd.DataFrame({model_name: [log_reg_presM]}, index = [model_metric])
log_reg_presdf
#%%
model_metric = 'Recall'
log_reg_recall = []
for key in t3_res[model_name]:
    log_reg_recall.append(t3_res[model_name][key][model_metric])
log_reg_recallM = mean(log_reg_recall)
print('key:', key, model_metric, ':', log_reg_recall)
print(log_reg_recallM)
log_reg_recalldf = pd.DataFrame({model_name: [log_reg_recallM]}, index = [model_metric])
log_reg_recalldf
#%%
model_metric = 'Accuracy'
log_reg_accu = []
for key in t3_res[model_name]:
    log_reg_accu.append(t3_res[model_name][key][model_metric])
log_reg_accuM = mean(log_reg_accu)
print('key:', key, model_metric, ':', log_reg_accu)
print(log_reg_accuM)
log_reg_accudf = pd.DataFrame({model_name: [log_reg_accuM]}, index = [model_metric])
log_reg_accudf
#%%
model_metric = 'ROC_AUC'  # NB: ROC_AUC is still commented out in MultClassPipeSKF, so this cell will KeyError until it is re-enabled
log_reg_roc_auc = []
for key in t3_res[model_name]:
    log_reg_roc_auc.append(t3_res[model_name][key][model_metric])
log_reg_roc_aucM = mean(log_reg_roc_auc)
print('key:', key, model_metric, ':', log_reg_roc_auc)
print(log_reg_roc_aucM)
log_reg_roc_aucdf = pd.DataFrame({model_name: [log_reg_roc_aucM]}, index = [model_metric])
log_reg_roc_aucdf
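#%% All six cells above repeat one pattern; a compact sketch, assuming t3_res
# keeps the {model: {fold: {metric: value}}} shape MultClassPipeSKF returns:
from statistics import mean
import pandas as pd

metrics = ['F1_score', 'MCC', 'Precision', 'Recall', 'Accuracy']  # add 'ROC_AUC' once it is recorded
summary = {model_name: {m: mean(t3_res[model_name][f][m]
                                for f in t3_res[model_name])
                        for m in metrics}
           for model_name in t3_res}
scores_df = pd.DataFrame(summary).T  # rows = models, columns = metrics
print(scores_df)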

(deleted file, name not shown)

@@ -1,40 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 18:06:34 2022
@author: tanu
"""
models = [
    ('Logistic Regression' , log_reg)
    , ('K-Nearest Neighbors', knn)
]
classification_metrics = {
    'F1_score': []
    ,'MCC': []
    ,'Precision': []
    ,'Recall': []
    ,'Accuracy': []
    ,'ROC_curve': []
}
folds = [1, 2]
fold_no = 1
fold_dict = {}
for model_name, model in models:
    fold_dict.update({model_name: {}})

for f in folds:
    fold = "fold_" + str(fold_no)
    for model_name, model in models:
        print("start of model", model_name, "fold: ", fold)
        fold_dict[model_name].update({fold: {}})
        fold_dict[model_name][fold].update(classification_metrics)
        print("end of model", model_name, "fold: ", fold)
        fold_dict[model_name][fold].update({'F1_score': random.randrange(1,10)})
    fold_no += 1
pp.pprint(fold_dict)