diff --git a/SKF_SSF.txt b/SKF_SSF.txt
new file mode 100644
index 0000000..77f45e1
--- /dev/null
+++ b/SKF_SSF.txt
@@ -0,0 +1,48 @@
+# Stratified K-fold vs ShuffleSplit
+
+https://stackoverflow.com/questions/45969390/difference-between-stratifiedkfold-and-stratifiedshufflesplit-in-sklearn
+
+In ShuffleSplit, the data is shuffled every time and then split, so the test sets may overlap between splits.
+In SKF, the test sets don't overlap.
+
+So the difference is that StratifiedKFold shuffles and splits just once, so the test sets do not overlap, whereas StratifiedShuffleSplit reshuffles before each of its n_splits splits, so the test sets can overlap.
+
+Note: both methods use stratified folds (that is why "stratified" appears in both names), meaning each part preserves the same percentage of samples of each class (label) as the original data. You can read more in the scikit-learn cross-validation documentation.
+
+
+''' python code '''
+splits = 5
+
+tx = range(10)
+ty = [0] * 5 + [1] * 5
+
+from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
+from sklearn import datasets
+
+kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
+shufflesplit = StratifiedShuffleSplit(n_splits=splits, random_state=42, test_size=2)
+
+print("KFold")
+for train_index, test_index in kfold.split(tx, ty):
+    print("TRAIN:", train_index, "TEST:", test_index)
+
+print("Shuffle Split")
+for train_index, test_index in shufflesplit.split(tx, ty):
+    print("TRAIN:", train_index, "TEST:", test_index)
+
+'''
+Output:
+
+KFold
+TRAIN: [0 2 3 4 5 6 7 9] TEST: [1 8]
+TRAIN: [0 1 2 3 5 7 8 9] TEST: [4 6]
+TRAIN: [0 1 3 4 5 6 8 9] TEST: [2 7]
+TRAIN: [1 2 3 4 6 7 8 9] TEST: [0 5]
+TRAIN: [0 1 2 4 5 6 7 8] TEST: [3 9]
+
+Shuffle Split
+TRAIN: [8 4 1 0 6 5 7 2] TEST: [3 9]
+TRAIN: [7 0 3 9 4 5 1 6] TEST: [8 2]
+TRAIN: [1 2 5 6 4 8 9 0] TEST: [3 7]
+TRAIN: [4 6 7 8 3 5 1 2] TEST: [9 0]
+TRAIN: [7 2 6 5 4 3 0 9] TEST: [1 8]
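To make the stratification point above concrete, here is a minimal sketch (assuming numpy is available and using the same toy tx/ty labels as in SKF_SSF.txt); it prints the class counts of each test fold, and both splitters keep the original 50:50 class ratio.

''' python code '''
import numpy as np
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

tx = np.arange(10).reshape(-1, 1)          # 10 samples, 1 dummy feature
ty = np.array([0] * 5 + [1] * 5)           # balanced binary labels

splitters = [('StratifiedKFold'       , StratifiedKFold(n_splits=5, shuffle=True, random_state=42)),
             ('StratifiedShuffleSplit', StratifiedShuffleSplit(n_splits=5, test_size=2, random_state=42))]

for name, splitter in splitters:
    print(name)
    for train_index, test_index in splitter.split(tx, ty):
        # each test fold keeps one sample of each class (50:50, as in ty)
        print("TEST:", test_index, "class counts:", np.bincount(ty[test_index]))
'''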
diff --git a/__pycache__/MultClassPipe2.cpython-37.pyc b/__pycache__/MultClassPipe2.cpython-37.pyc
new file mode 100644
index 0000000..1cb8e8b
Binary files /dev/null and b/__pycache__/MultClassPipe2.cpython-37.pyc differ
diff --git a/__pycache__/MultClassPipe3.cpython-37.pyc b/__pycache__/MultClassPipe3.cpython-37.pyc
new file mode 100644
index 0000000..2777a29
Binary files /dev/null and b/__pycache__/MultClassPipe3.cpython-37.pyc differ
diff --git a/__pycache__/loopity_loop.cpython-37.pyc b/__pycache__/loopity_loop.cpython-37.pyc
new file mode 100644
index 0000000..effcb8e
Binary files /dev/null and b/__pycache__/loopity_loop.cpython-37.pyc differ
diff --git a/comp_results b/comp_results
new file mode 100644
index 0000000..9fd5e70
--- /dev/null
+++ b/comp_results
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Mar 10 10:59:36 2022
+
+@author: tanu
+"""
+# numerical
+#log_reg (rs)
+F1_score     0.713380
+MCC          0.376546
+Precision    0.687628
+Recall       0.747231
+Accuracy     0.687293
+ROC_curve    0.683199
+#log_reg (balanced)
+F1_score     0.715106
+MCC          0.390225
+Precision    0.702629
+Recall       0.733445
+Accuracy     0.694309
+ROC_curve    0.691555
+#log_reg (unbalanced)
+F1_score     0.713380
+MCC          0.376546
+Precision    0.687628
+Recall       0.747231
+Accuracy     0.687293
+ROC_curve    0.683199
\ No newline at end of file
diff --git a/imports.py b/imports.py
index 2eaf070..ab3606c 100644
--- a/imports.py
+++ b/imports.py
@@ -50,10 +50,10 @@ os.chdir(homedir + "/git/ML_AI_training/")
 # my function
 from MultClassPipe import MultClassPipeline
 from MultClassPipe2 import MultClassPipeline2
-from MultClassPipe3 import MultClassPipeSKF
+from loopity_loop import MultClassPipeSKF
 
-gene = 'pncA'
-drug = 'pyrazinamide'
+gene = 'rpoB'
+drug = 'rifampicin'
 
 #==============
 # directories
@@ -82,12 +82,19 @@ mycols = my_df.columns
 my_df['active_aa_pos'].dtype
 my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
 
+if gene.lower() in geneL_na_ppi2:
+    x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
+    #D1148 get rid of
+    na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
+    my_df = my_df.drop(index=na_index)
+
 #%%============================================================================
 # GET Y
 
 # Target1: mutation_info_labels
 dm_om_map = {'DM': 1, 'OM': 0}
 target1 = my_df['mutation_info_labels'].map(dm_om_map)
+target1.value_counts()
 
 # Target2: drug
 drug_labels = drug + '_labels'
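A side note on the new NaN-handling block in imports.py: the same row drop can be expressed with pandas' own missing-value helpers. A minimal sketch (using a made-up stand-in for my_df, since only the mcsm_na_affinity column matters for this step):

''' python code '''
import numpy as np
import pandas as pd

# toy stand-in for my_df; only the mcsm_na_affinity column matters here
my_df = pd.DataFrame({'mutationinformation': ['A1B', 'C2D', 'E3F'],
                      'mcsm_na_affinity'   : [0.5, np.nan, -1.2]})

# equivalent to building na_index with apply(np.isnan) and then drop(index=na_index)
my_df = my_df.dropna(subset=['mcsm_na_affinity'])
print(my_df)
'''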
diff --git a/loopity_detangle.py b/loopity_detangle.py
new file mode 100644
index 0000000..56f6999
--- /dev/null
+++ b/loopity_detangle.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Mar 10 18:06:34 2022
+
+@author: tanu
+"""
+
+#%%
+models = [
+    ('Logistic Regression' , log_reg)
+    , ('K-Nearest Neighbors', knn)
+    ]
+
+classification_metrics = {
+    'F1_score': []
+    ,'MCC': []
+    ,'Precision': []
+    ,'Recall': []
+    ,'Accuracy': []
+    ,'ROC_curve': []
+    }
+
+folds=[1,2]
+fold_no=1
+fold_dict={}
+for model_name, model in models:
+    fold_dict.update({model_name: {}})
+
+for f in folds:
+    fold=("fold_"+str(fold_no))
+    for model_name, model in models:
+        print("start of model", model_name, "fold: ", fold)
+        fold_dict[model_name].update({fold: {}})
+        fold_dict[model_name][fold].update(classification_metrics)
+
+        print("end of model", model_name, "fold: ", fold)
+        fold_dict[model_name][fold].update({'F1_score': random.randrange(1,10)})
+    fold_no +=1
+    pp.pprint(fold_dict)
+
+
+#%%
+folds_f1=[]
+
+for model_name, model in models:
+    print("Calculating mean for F1_score for: ", model_name)
+    #for key in fold_dict['Logistic Regression']:
+    # wrap this in a classification_metric for loop
+    for key in fold_dict[model_name]:
+        folds_f1.append(fold_dict[model_name][key]['F1_score'])
+        #folds_f1.append(folds_f1)
+        print('key:', key, 'F1scores:', folds_f1)
+mean(folds_f1)
+#%%
+scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
+
+# manually
+model_name = 'Logistic Regression'
+model_metric = 'F1_score'
+
+log_reg_f1 = []
+for key in fold_dict[model_name]:
+    log_reg_f1.append(fold_dict[model_name][key][model_metric])
+    log_reg_f1M = mean(log_reg_f1)
+    print('key:', key, model_metric, ':', log_reg_f1)
+print(log_reg_f1M)
+
+log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
+log_reg_f1df
+
+#%%
+model_metric = 'MCC'
+log_reg_mcc = []
+for key in fold_dict[model_name]:
+    log_reg_mcc.append(fold_dict[model_name][key][model_metric])
+    log_reg_mccM = mean(log_reg_mcc)
+    print('key:', key, model_metric, ':', log_reg_mcc)
+print(log_reg_mccM)
+
+log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
+log_reg_mccdf
\ No newline at end of file
diff --git a/loopity_loop.py b/loopity_loop.py
index 936cc6d..17fd851 100644
--- a/loopity_loop.py
+++ b/loopity_loop.py
@@ -97,13 +97,13 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
         ,'Precision': []
         ,'Recall': []
         ,'Accuracy': []
-        ,'ROC_curve': []
+        #,'ROC_AUC': []
         }
     models = [
         ('Logistic Regression' , log_reg)
-        #, ('Naive Bayes' , nb)
+        , ('Naive Bayes' , nb)
         , ('K-Nearest Neighbors', knn)
-        # , ('SVM' , svm)
+        , ('SVM' , svm)
         # , ('MLP' , mlp)
         # , ('Decision Tree' , dt)
         # , ('Extra Trees' , et)
@@ -132,10 +132,8 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
         y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
         #print("Fold: ", fold_no, len(train_index), len(test_index))
 
-        # for keys in skf_dict:
-
         for model_name, model in models:
-            print("start of model", model_name, " loop", fold_no)
+            print("\nStart of model", model_name, "\nLoop no.", fold_no)
             #skf_dict.update({model_name: classification_metrics })
             model_pipeline = Pipeline(steps=[('prep' , col_transform)
                                              , ('classifier' , model)])
@@ -145,28 +143,39 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
             #----------------
             # Model metrics
             #----------------
-            score=f1_score(y_test_fold, y_pred_fold)
-            mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
+            fscore = f1_score(y_test_fold, y_pred_fold)
+            mcc    = matthews_corrcoef(y_test_fold, y_pred_fold)
+            #pres   = precision_score(y_test_fold, y_pred_fold)
+            #recall = recall_score(y_test_fold, y_pred_fold)
+            pres   = precision_score(y_test_fold, y_pred_fold, zero_division=0)
+            recall = recall_score(y_test_fold, y_pred_fold, zero_division=0)
+            accu   = accuracy_score(y_test_fold, y_pred_fold)
+            #roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
 
             fold=("fold_"+str(fold_no))
             fold_dict[model_name].update({fold: {}})
-            pp.pprint(fold_dict)
-            print("end of model", model_name, " loop", fold_no)
+            #pp.pprint(fold_dict)
+            print("\nEnd of model", model_name, "\nLoop no.", fold_no)
 
             fold_dict[model_name][fold].update(classification_metrics)
             #fold_dict[model_name][fold]['F1_score'].append(score)
-            fold_dict[model_name][fold].update({'F1_score': score})
-            fold_dict[model_name][fold].update({'MCC': mcc})
-
+            fold_dict[model_name][fold].update({'F1_score'  : fscore})
+            fold_dict[model_name][fold].update({'MCC'       : mcc})
+            fold_dict[model_name][fold].update({'Precision' : pres})
+            fold_dict[model_name][fold].update({'Recall'    : recall})
+            fold_dict[model_name][fold].update({'Accuracy'  : accu})
+            #fold_dict[model_name][fold].update({'ROC_AUC'   : roc_auc})
+
         fold_no +=1
         #pp.pprint(skf_dict)
     return(fold_dict)
 
-t3_res = MultClassPipeSKF(input_df = numerical_features_df
-                          , y_targetF = target1
-                          , var_type = 'numerical'
-                          , skf_splits = 10)
-#pp.pprint(t3_res)
-#print(t3_res)
+#%% Call function
+# t3_res = MultClassPipeSKF(input_df = numerical_features_df
+#                           , y_targetF = target1
+#                           , var_type = 'numerical'
+#                           , skf_splits = 10)
+# pp.pprint(t3_res)
+# #print(t3_res)
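Since the ROC_AUC lines are commented out above: if they are re-enabled, roc_auc_score is normally given the positive-class probability rather than the hard 0/1 predictions. A minimal, self-contained sketch of that pattern (toy data and a toy pipeline, not the project's own col_transform):

''' python code '''
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

model_pipeline = Pipeline(steps=[('prep'       , StandardScaler())
                                 , ('classifier', LogisticRegression())])
model_pipeline.fit(X_train, y_train)

# probability of the positive class, not the 0/1 predictions
y_prob = model_pipeline.predict_proba(X_test)[:, 1]
print('ROC_AUC:', roc_auc_score(y_test, y_prob))
'''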
diff --git a/loopity_loop_CALL b/loopity_loop_CALL
new file mode 100644
index 0000000..4916d2b
--- /dev/null
+++ b/loopity_loop_CALL
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Mar 11 11:15:50 2022
+
+@author: tanu
+"""
+#%%
+del(t3_res)
+t3_res = MultClassPipeSKF(input_df = numerical_features_df
+                          , y_targetF = target1
+                          , var_type = 'numerical'
+                          , skf_splits = 10)
+pp.pprint(t3_res)
+#print(t3_res)
+
+#%% Manually: mean for each model, each metric
+model_name = 'Logistic Regression'
+model_name = 'Naive Bayes'
+model_name = 'K-Nearest Neighbors'
+model_name = 'SVM'
+
+#%%
+model_metric = 'F1_score'
+
+log_reg_f1 = []
+for key in t3_res[model_name]:
+    log_reg_f1.append(t3_res[model_name][key][model_metric])
+    log_reg_f1M = mean(log_reg_f1)
+    print('key:', key, model_metric, ':', log_reg_f1)
+print(log_reg_f1M)
+
+log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
+log_reg_f1df
+
+#%%
+model_metric = 'MCC'
+log_reg_mcc = []
+for key in t3_res[model_name]:
+    log_reg_mcc.append(t3_res[model_name][key][model_metric])
+    log_reg_mccM = mean(log_reg_mcc)
+    print('key:', key, model_metric, ':', log_reg_mcc)
+print(log_reg_mccM)
+
+log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
+log_reg_mccdf
+#%%
+model_metric = 'Precision'
+log_reg_pres = []
+for key in t3_res[model_name]:
+    log_reg_pres.append(t3_res[model_name][key][model_metric])
+    log_reg_presM = mean(log_reg_pres)
+    print('key:', key, model_metric, ':', log_reg_pres)
+print(log_reg_presM)
+
+log_reg_presdf = pd.DataFrame({model_name: [log_reg_presM]}, index = [model_metric])
+log_reg_presdf
+#%%
+model_metric = 'Recall'
+log_reg_recall = []
+for key in t3_res[model_name]:
+    log_reg_recall.append(t3_res[model_name][key][model_metric])
+    log_reg_recallM = mean(log_reg_recall)
+    print('key:', key, model_metric, ':', log_reg_recall)
+print(log_reg_recallM)
+
+log_reg_recalldf = pd.DataFrame({model_name: [log_reg_recallM]}, index = [model_metric])
+log_reg_recalldf
+#%%
+model_metric = 'Accuracy'
+log_reg_accu = []
+for key in t3_res[model_name]:
+    log_reg_accu.append(t3_res[model_name][key][model_metric])
+    log_reg_accuM = mean(log_reg_accu)
+    print('key:', key, model_metric, ':', log_reg_accu)
+print(log_reg_accuM)
+
+log_reg_accudf = pd.DataFrame({model_name: [log_reg_accuM]}, index = [model_metric])
+log_reg_accudf
+#%%
+model_metric = 'ROC_AUC'
+log_reg_roc_auc = []
+for key in t3_res[model_name]:
+    log_reg_roc_auc.append(t3_res[model_name][key][model_metric])
+    log_reg_roc_aucM = mean(log_reg_roc_auc)
+    print('key:', key, model_metric, ':', log_reg_roc_auc)
+print(log_reg_roc_aucM)
+
+log_reg_roc_aucdf = pd.DataFrame({model_name: [log_reg_roc_aucM]}, index = [model_metric])
+log_reg_roc_aucdf
\ No newline at end of file
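The six per-metric cells in loopity_loop_CALL repeat the same append/mean pattern by hand. As a sketch of a more compact alternative (assuming t3_res keeps the {model: {fold: {metric: value}}} shape returned by MultClassPipeSKF; the numbers below are dummies):

''' python code '''
import pandas as pd

# dummy t3_res with the assumed {model: {fold: {metric: value}}} shape
t3_res = {'Logistic Regression': {'fold_1': {'F1_score': 0.70, 'MCC': 0.40},
                                  'fold_2': {'F1_score': 0.60, 'MCC': 0.30}},
          'K-Nearest Neighbors': {'fold_1': {'F1_score': 0.50, 'MCC': 0.20},
                                  'fold_2': {'F1_score': 0.40, 'MCC': 0.10}}}

# one row per (model, fold), one column per metric
scores_df = pd.concat({model: pd.DataFrame(folds).T for model, folds in t3_res.items()},
                      names=['Model', 'Fold'])

# mean of every metric across folds, for every model at once
mean_scores = scores_df.groupby(level='Model').mean()
print(mean_scores)
'''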
- print("start of model", model_name, "fold: ", fold) - fold_dict[model_name].update({fold: {}}) - fold_dict[model_name][fold].update(classification_metrics) - - print("end of model", model_name, "fold: ", fold) - fold_dict[model_name][fold].update({'F1_score': random.randrange(1,10)}) - fold_no +=1 - pp.pprint(fold_dict) - - \ No newline at end of file