renamed hyperparams to gscv
This commit is contained in:
parent a82358dbb4
commit ad5ebad7f8
31 changed files with 4433 additions and 0 deletions
BIN
__pycache__/lazypredict.cpython-37.pyc
Normal file
Binary file not shown.
264
classification_names_params.py
Normal file
@ -0,0 +1,264 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 18 09:47:48 2022

@author: tanu
"""

#%% Useful links
# https://stackoverflow.com/questions/41844311/list-of-all-classification-algorithms
# https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
# https://github.com/davidsbatista/machine-learning-notebooks/blob/master/hyperparameter-across-models.ipynb
# https://scikit-learn.org/stable/modules/svm.html#classification
# https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/ # [params]
# https://uk.mathworks.com/help/stats/hyperparameter-optimization-in-classification-learner-app.html # [algo]
# As a general rule of thumb, baseline models should be run on the dataset first. H2O AutoML and other
# AutoML packages do this automatically; the aim here is to do it with a scikit-learn Pipeline.
# https://codereview.stackexchange.com/questions/256934/model-pipeline-to-run-multiple-classifiers-for-ml-classification
# QDA: https://www.geeksforgeeks.org/quadratic-discriminant-analysis/

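# NOTE (sketch, not in the original commit): this module assumes the estimator classes
# below are already imported and that two shared dicts, rs and njobs, are defined
# elsewhere in the repo. rs = {'random_state': 42} matches the sibling scripts; the
# exact value of njobs is not shown in this commit, so the one below is an assumption.
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                              BaggingClassifier, GradientBoostingClassifier)
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import (RBF, DotProduct, Matern,
                                              RationalQuadratic, WhiteKernel)
from xgboost import XGBClassifier

rs = {'random_state': 42}   # as defined in the sibling scripts
njobs = {'n_jobs': -1}      # assumed value; not shown in this commit
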
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

# NOTE: Logistic regression
# The choice of solver depends on the penalty chosen. Supported penalties by solver:
# 'newton-cg'  - ['l2', 'none']
# 'lbfgs'      - ['l2', 'none']
# 'liblinear'  - ['l1', 'l2']
# 'sag'        - ['l2', 'none']
# 'saga'       - ['elasticnet', 'l1', 'l2', 'none']

# SVR (regression, kept for reference only)?
# estimator = SVR(kernel='rbf')
# param_grid = {
#     'C': [1.1, 5.4, 170, 1001],
#     'epsilon': [0.0003, 0.007, 0.0109, 0.019, 0.14, 0.05, 8, 0.2, 3, 2, 7],
#     'gamma': [0.7001, 0.008, 0.001, 3.1, 1, 1.3, 5]
# }


#%% Classification algorithms param grid
#%% LogisticRegression()
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
gs_lr = Pipeline((
    ('pre', MinMaxScaler())
    , ('clf', LogisticRegression(**rs
                                 , **njobs))
))
gs_lr_params = {
    'clf__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
    #'C': np.logspace(-4, 4, 50)
    , 'clf__penalty': ['l1', 'l2', 'elasticnet', 'none']
    , 'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    # NOTE: not every penalty/solver pair above is valid (see the note earlier);
    # invalid combinations will error or be scored as NaN by GridSearchCV
    # (see the sketch below).
}
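# A sketch (illustrative, not the grid used above) of how the same search space can be
# written as a list of dicts so that only valid solver/penalty pairs are tried:
gs_lr_params_valid = [
    {'clf__solver': ['liblinear'],
     'clf__penalty': ['l1', 'l2'],
     'clf__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]},
    {'clf__solver': ['newton-cg', 'lbfgs', 'sag'],
     'clf__penalty': ['l2', 'none'],
     'clf__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]},
    {'clf__solver': ['saga'],
     'clf__penalty': ['elasticnet'],
     'clf__l1_ratio': [0.5],   # elasticnet needs l1_ratio; the value here is illustrative
     'clf__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]},
]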
#%% DecisionTreeClassifier()

gs_dt = Pipeline((
    ('pre', MinMaxScaler())
    , ('clf', DecisionTreeClassifier(**rs))   # NOTE: DecisionTreeClassifier has no n_jobs
))
gs_dt_params = {
    'clf__max_depth': [2, 4, 6, 8, 10]
    , 'clf__criterion': ['gini', 'entropy']
    , 'clf__max_features': ['auto', None]
    , 'clf__max_leaf_nodes': [10, 20, 30, 40]
}
#%% KNeighborsClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
gs_knn = Pipeline((
    ('pre', MinMaxScaler())
    , ('clf', KNeighborsClassifier(**njobs))   # NOTE: KNeighborsClassifier has no random_state
))

gs_knn_params = {
    'clf__n_neighbors': [3, 7, 10]
    #, 'clf__n_neighbors': range(1, 21, 2)
    , 'clf__metric': ['euclidean', 'manhattan', 'minkowski']
    , 'clf__weights': ['uniform', 'distance']
}
#%% RandomForestClassifier()

gs_rf = Pipeline((
    ('pre', MinMaxScaler())
    , ('clf', RandomForestClassifier(**rs
                                     , **njobs
                                     , bootstrap=True
                                     , oob_score=True))
))
gs_rf_params = {
    'clf__max_depth': [4, 6, 8, 10, 12, 16, 20, None]
    , 'clf__class_weight': ['balanced', 'balanced_subsample']
    , 'clf__n_estimators': [10, 100, 1000]
    , 'clf__criterion': ['gini', 'entropy']
    , 'clf__max_features': ['auto', 'sqrt']
    , 'clf__min_samples_leaf': [2, 4, 8, 50]
    , 'clf__min_samples_split': [10, 20]
}
#%% XGBClassifier()
# https://stackoverflow.com/questions/34674797/xgboost-xgbclassifier-defaults-in-python
gs_xgb = Pipeline((
    ('pre', MinMaxScaler())
    , ('clf', XGBClassifier(**rs
                            , **njobs))
))

gs_xgb_params = {
    'clf__learning_rate': [0.01, 0.05, 0.1, 0.2]
    , 'clf__max_depth': [4, 6, 8, 10, 12, 16, 20]
    # 'min_samples_leaf' and 'max_features' are tree/forest parameters, not
    # XGBClassifier parameters, so they are not included here; the XGBoost
    # analogues to tune instead would be min_child_weight and colsample_bytree.
}
#%% MLPClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
gs_mlp = Pipeline((
    ('pre', MinMaxScaler())
    , ('clf', MLPClassifier(**rs
                            , max_iter=500))   # NOTE: MLPClassifier has no n_jobs
))

gs_mlp_params = {
    'clf__hidden_layer_sizes': [(1,), (2,), (3,)]
    # 'max_features', 'min_samples_leaf' and 'min_samples_split' are tree/forest
    # parameters, not MLPClassifier parameters, so they are not included here;
    # MLP-specific candidates to tune instead would be clf__alpha and
    # clf__learning_rate_init.
}
#%% RidgeClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifier.html
gs_rc = Pipeline((
    ('pre', MinMaxScaler())   # CHECK if it wants -1 to 1
    , ('clf', RidgeClassifier(**rs))   # NOTE: RidgeClassifier has no n_jobs
))

gs_rc_params = {
    'clf__alpha': [0.1, 0.2, 0.5, 0.8, 1.0]
}

#%% SVC()
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
gs_svc = Pipeline((
    ('pre', MinMaxScaler())   # CHECK if it wants -1 to 1
    , ('clf', SVC(**rs))   # NOTE: SVC has no n_jobs
))

gs_svc_params = {
    'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid']   # 'precomputed' needs a Gram matrix, so it is omitted
    , 'clf__C': [50, 10, 1.0, 0.1, 0.01]
    , 'clf__gamma': ['scale', 'auto']
}

#%% BaggingClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
gs_bdt = Pipeline((
    ('pre', MinMaxScaler())   # CHECK if it wants -1 to 1
    , ('clf', BaggingClassifier(**rs
                                , **njobs
                                , bootstrap=True
                                , oob_score=True))
))

gs_bdt_params = {
    'clf__n_estimators': [10, 100, 1000]
    # If None, then the base estimator is a DecisionTreeClassifier.
    , 'clf__base_estimator': [None, SVC(), KNeighborsClassifier()]
    # 'gamma' is an SVC parameter, not a BaggingClassifier parameter, so it is omitted.
}
#%% GradientBoostingClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
gs_gb = Pipeline((
    ('pre', MinMaxScaler())   # CHECK if it wants -1 to 1
    , ('clf', GradientBoostingClassifier(**rs))
))

gs_gb_params = {
    'clf__n_estimators': [10, 100, 1000]
    , 'clf__learning_rate': [0.001, 0.01, 0.1]
    , 'clf__subsample': [0.5, 0.7, 1.0]
    , 'clf__max_depth': [3, 7, 9]
}
#%% AdaBoostClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#sklearn.ensemble.AdaBoostClassifier
gs_abc = Pipeline((
    ('pre', MinMaxScaler())   # CHECK if it wants -1 to 1
    , ('clf', AdaBoostClassifier(**rs))
))

gs_abc_params = {
    'clf__n_estimators': [1, 2]   # must be a positive integer
    # SVC needs probability=True to work with AdaBoost's default SAMME.R algorithm;
    # KNeighborsClassifier cannot be used as an AdaBoost base estimator because it
    # does not support sample weights.
    , 'clf__base_estimator': [None, SVC(probability=True, random_state=42)]
    #, 'clf__splitter': ['best', 'random']
}
#%% GaussianProcessClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessClassifier.html
# GaussianProcessClassifier(1.0 * RBF(1.0)),
gs_gpc = Pipeline((
    ('pre', MinMaxScaler())   # CHECK if it wants -1 to 1
    , ('clf', GaussianProcessClassifier(**rs))
))

gs_gpc_params = {
    'clf__kernel': [1*RBF(), 1*DotProduct(), 1*Matern(), 1*RationalQuadratic(), 1*WhiteKernel()]
}

#%% GaussianNB()
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
gs_gnb = Pipeline((
    ('pre', MinMaxScaler())
    , ('pca', PCA())   # CHECK if it wants -1 to 1
    , ('clf', GaussianNB())   # NOTE: GaussianNB has no random_state
))

gs_gnb_params = {
    'clf__priors': [None]
    , 'clf__var_smoothing': np.logspace(0, -9, num=100)
}

#%% QuadraticDiscriminantAnalysis()
# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html

gs_qda = Pipeline((
    ('pre', MinMaxScaler())
    #, ('pca', PCA())   # CHECK if it wants -1 to 1
    , ('clf', QuadraticDiscriminantAnalysis())
))
#%% BernoulliNB()
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html
gs_bnb = Pipeline((
    ('pre', MinMaxScaler())
    , ('clf', BernoulliNB())
))
# defaults: BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
gs_bnb_params = {
    'clf__alpha': [0, 1]
    , 'clf__binarize': [None, 0]
    , 'clf__fit_prior': [True]
    , 'clf__class_prior': [None]
}
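# A minimal, self-contained sketch of how a (pipeline, parameter grid) pair defined
# above is meant to be consumed; the toy data is for illustration only -- the real
# driver code and data live in the other scripts of this repo.
if __name__ == '__main__':
    from sklearn.datasets import make_classification
    from sklearn.model_selection import GridSearchCV

    X_demo, y_demo = make_classification(n_samples=200, n_features=8, random_state=42)
    gscv_demo = GridSearchCV(gs_rc, gs_rc_params, cv=5, scoring='f1', n_jobs=-1)
    gscv_demo.fit(X_demo, y_demo)
    print('Best model:\n', gscv_demo.best_params_)
    print('Best models score:\n', gscv_demo.best_score_)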
128
earlier_versions/GSCV_base
Normal file
@ -0,0 +1,128 @@
# Logistic regression:
pnca
input: numerical features
output: dm/om: target

Grid search over a base estimator (a single model with hyperparameter choices) gives you the best model based on a SINGLE metric.
-- question: which metric should be optimised for?
A base estimator with multiple models and multiple hyperparameters returns the OVERALL best model/hyperparameter combo, again based on a single score.
-- question: which metric should be optimised for?

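(Sketch, not part of the original notes: the five metrics below can be computed in a single GridSearchCV run by passing a scoring dict and choosing one metric to refit on; MCC needs a custom scorer.)

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

X, y = make_classification(n_samples=300, n_features=6, random_state=42)  # stand-in data
pipe = Pipeline([('pre', MinMaxScaler()),
                 ('clf', LogisticRegression(random_state=42))])
params = {'clf__solver': ['liblinear', 'sag', 'saga'], 'clf__max_iter': [100, 200]}
scoring = {'accuracy': 'accuracy', 'f1': 'f1', 'recall': 'recall',
           'roc_auc': 'roc_auc', 'mcc': make_scorer(matthews_corrcoef)}
gscv = GridSearchCV(pipe, params, cv=10, scoring=scoring, refit='mcc')
gscv.fit(X, y)
print('Best model:\n', gscv.best_params_)
print('Best models score:\n', gscv.best_score_)
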
# Demonstration

###################
# Metric1: accuracy
###################

Best model:
{'clf__max_iter': 100, 'clf__solver': 'liblinear'}

Best models score:
0.7145320197044336


###################
# Metric2: F1
###################
Best model:
{'clf__max_iter': 100, 'clf__solver': 'saga'}
Best models score:
0.7550294183111348


###################
# Metric3: Recall
###################
Best model:
{'clf__max_iter': 100, 'clf__solver': 'saga'}
Best models score:
0.8216666666666667


###################
# Metric4: ROC_AUC
###################

Best model:
{'clf__max_iter': 200, 'clf__solver': 'sag'}
Best models score:
0.7711904761904762

###################
# Metric5: MCC
###################

Best model:
{'clf__max_iter': 100, 'clf__solver': 'saga'}
Best models score:
0.4322970173039572

sklearn/linear_model/_sag.py:354: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
ConvergenceWarning,

#####################################
# Same thing but using: CLFSwitcher()
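(CLFSwitcher itself is not included in this commit; the sketch below shows the usual wrapper-estimator pattern such a class refers to, so the clf__estimator parameters in the results make sense. Its exact implementation here is an assumption.)

from sklearn.base import BaseEstimator, ClassifierMixin

class ClfSwitcher(BaseEstimator, ClassifierMixin):
    """Wrapper so GridSearchCV can swap whole estimators via the clf__estimator parameter."""
    def __init__(self, estimator=None):
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)

# used e.g. as: Pipeline([('pre', MinMaxScaler()), ('clf', ClfSwitcher())]) with a grid
# over 'clf__estimator': [LogisticRegression(...), BernoulliNB(), ...]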

###################
# Metric1: Accuracy
###################

Best model:
{'clf__estimator': LogisticRegression(random_state=42, solver='liblinear')
, 'clf__estimator__max_iter': 100, 'clf__estimator__solver': 'liblinear'}
Best models score:
0.7219298245614035

###################
# Metric2: F1
###################
Best model:
{'clf__estimator': LogisticRegression(random_state=42, solver='liblinear'), 'clf__estimator__max_iter': 100, 'clf__estimator__solver': 'liblinear'}

print('Best models score:\n', gscv.best_score_)
Best models score:
0.7585724070894442

###################
# Metric3: Recall
###################
Best model:
{'clf__estimator': LogisticRegression(random_state=42, solver='liblinear')
, 'clf__estimator__max_iter': 100, 'clf__estimator__solver': 'liblinear'}
Best models score:
0.8198610213316095

###################
# Metric4: ROC_AUC
###################
Best model:
{'clf__estimator': LogisticRegression(solver='newton-cg')
, 'clf__estimator__max_iter': 100, 'clf__estimator__solver': 'newton-cg'}

Best models score:
nan

###################
# Metric5: MCC
###################
Best model:
{'clf__estimator': LogisticRegression(random_state=42, solver='liblinear')
, 'clf__estimator__max_iter': 100, 'clf__estimator__solver': 'liblinear'}

Best models score:
0.4480248700902755



print('Best model:\n', gs_dt.best_params_)
Best model:
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'max_leaf_nodes': 10}

print('Best models score:\n', gs_dt.best_score_)
Best models score:
0.43290518915746007

109
earlier_versions/MultClassPipe.py
Normal file
|
@ -0,0 +1,109 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Mar 4 15:25:33 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%%
|
||||
import os, sys
|
||||
import pandas as pd
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.naive_bayes import BernoulliNB
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
|
||||
from sklearn.neural_network import MLPClassifier
|
||||
from sklearn.pipeline import Pipeline
|
||||
from xgboost import XGBClassifier
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
|
||||
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
|
||||
#%%
|
||||
rs = {'random_state': 42}
|
||||
# TODO: add preprocessing step with one hot encoder
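# A minimal sketch of the preprocessing step flagged in the TODO above: one-hot encode
# the categorical columns and scale the numeric ones inside each pipeline. The column
# selection here is generic (by dtype); it is an illustration, not yet wired into
# MultClassPipeline below.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

def make_preprocessor(input_df):
    """Build a ColumnTransformer that scales numeric and one-hot encodes categorical columns."""
    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
    return ColumnTransformer(
        transformers=[('num', MinMaxScaler(), numerical_ix),
                      ('cat', OneHotEncoder(), categorical_ix)],
        remainder='passthrough')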
|
||||
|
||||
# Multiple Classification - Model Pipeline
|
||||
def MultClassPipeline(X_train, X_test, y_train, y_test):
|
||||
|
||||
log_reg = LogisticRegression(**rs)
|
||||
nb = BernoulliNB()
|
||||
knn = KNeighborsClassifier()
|
||||
svm = SVC(**rs)
|
||||
mlp = MLPClassifier(max_iter=500, **rs)
|
||||
dt = DecisionTreeClassifier(**rs)
|
||||
et = ExtraTreesClassifier(**rs)
|
||||
rf = RandomForestClassifier(**rs)
|
||||
rf2 = RandomForestClassifier(
|
||||
min_samples_leaf=50,
|
||||
n_estimators=150,
|
||||
bootstrap=True,
|
||||
oob_score=True,
|
||||
n_jobs=-1,
|
||||
random_state=42,
|
||||
max_features='auto')
|
||||
|
||||
xgb = XGBClassifier(**rs, verbosity=0)
|
||||
|
||||
clfs = [
|
||||
('Logistic Regression', log_reg),
|
||||
('Naive Bayes', nb),
|
||||
('K-Nearest Neighbors', knn),
|
||||
('SVM', svm),
|
||||
('MLP', mlp),
|
||||
('Decision Tree', dt),
|
||||
('Extra Trees', et),
|
||||
('Random Forest', rf),
|
||||
('Random Forest2', rf2),
|
||||
('XGBoost', xgb)
|
||||
]
|
||||
|
||||
|
||||
pipelines = []
|
||||
|
||||
scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
|
||||
|
||||
for clf_name, clf in clfs:
|
||||
|
||||
pipeline = Pipeline(steps=[
|
||||
('scaler', MinMaxScaler()),
|
||||
#('scaler', StandardScaler()),
|
||||
('classifier', clf)
|
||||
]
|
||||
)
|
||||
pipeline.fit(X_train, y_train)
|
||||
|
||||
# Model predictions
|
||||
y_pred = pipeline.predict(X_test)
|
||||
|
||||
# F1-Score
|
||||
fscore = f1_score(y_test, y_pred)
|
||||
# Precision
|
||||
pres = precision_score(y_test, y_pred)
|
||||
# Recall
|
||||
recall = recall_score(y_test, y_pred)
|
||||
# Accuracy
|
||||
accu = accuracy_score(y_test, y_pred)
|
||||
# ROC_AUC
|
||||
roc_auc = roc_auc_score(y_test, y_pred)
|
||||
# Matthews correlation coefficient
|
||||
mcc = matthews_corrcoef(y_test, y_pred)
|
||||
|
||||
pipelines.append(pipeline)
|
||||
|
||||
scores_df = scores_df.append({
|
||||
'Model' : clf_name
|
||||
, 'F1_Score' : fscore
|
||||
, 'MCC' : mcc
|
||||
, 'Precision' : pres
|
||||
, 'Recall' : recall
|
||||
, 'Accuracy' : accu
|
||||
, 'ROC_AUC' : roc_auc
|
||||
}
|
||||
, ignore_index = True)
|
||||
|
||||
return pipelines, scores_df
|
||||
|
48
earlier_versions/SKF_SSF.txt
Normal file
@ -0,0 +1,48 @@
# Stratified K-fold vs ShuffleSplit

https://stackoverflow.com/questions/45969390/difference-between-stratifiedkfold-and-stratifiedshufflesplit-in-sklearn

In ShuffleSplit, the data is shuffled every time and then split, so the test sets may overlap between the splits.
In SKF, the test sets do not overlap.

So the difference is that StratifiedKFold just shuffles and splits once, therefore the test sets do not overlap, while StratifiedShuffleSplit shuffles each time before splitting, and since it splits n_splits times, the test sets can overlap.

Note: both methods use "stratified" folds (that is why "stratified" appears in both names): each part preserves the same percentage of samples of each class (label) as the original data. You can read more in the cross_validation documents.


''' python code '''
splits = 5

tx = range(10)
ty = [0] * 5 + [1] * 5

from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import datasets

kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
shufflesplit = StratifiedShuffleSplit(n_splits=splits, random_state=42, test_size=2)

print("KFold")
for train_index, test_index in kfold.split(tx, ty):
    print("TRAIN:", train_index, "TEST:", test_index)

print("Shuffle Split")
for train_index, test_index in shufflesplit.split(tx, ty):
    print("TRAIN:", train_index, "TEST:", test_index)

'''
Output:

KFold
TRAIN: [0 2 3 4 5 6 7 9] TEST: [1 8]
TRAIN: [0 1 2 3 5 7 8 9] TEST: [4 6]
TRAIN: [0 1 3 4 5 6 8 9] TEST: [2 7]
TRAIN: [1 2 3 4 6 7 8 9] TEST: [0 5]
TRAIN: [0 1 2 4 5 6 7 8] TEST: [3 9]

Shuffle Split
TRAIN: [8 4 1 0 6 5 7 2] TEST: [3 9]
TRAIN: [7 0 3 9 4 5 1 6] TEST: [8 2]
TRAIN: [1 2 5 6 4 8 9 0] TEST: [3 7]
TRAIN: [4 6 7 8 3 5 1 2] TEST: [9 0]
TRAIN: [7 2 6 5 4 3 0 9] TEST: [1 8]
29
earlier_versions/comp_results
Normal file
@ -0,0 +1,29 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 10:59:36 2022

@author: tanu
"""
# numerical
# log_reg (rs)
F1_score     0.713380
MCC          0.376546
Precision    0.687628
Recall       0.747231
Accuracy     0.687293
ROC_curve    0.683199
# log_reg (balanced)
F1_score     0.715106
MCC          0.390225
Precision    0.702629
Recall       0.733445
Accuracy     0.694309
ROC_curve    0.691555
# log_reg (unbalanced)
F1_score     0.713380
MCC          0.376546
Precision    0.687628
Recall       0.747231
Accuracy     0.687293
ROC_curve    0.683199
229
earlier_versions/imports_v1.py
Normal file
|
@ -0,0 +1,229 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sun Mar 6 13:41:54 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
import os, sys
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
#from copy import deepcopy
|
||||
from sklearn import linear_model
|
||||
from sklearn.linear_model import LogisticRegression, LinearRegression
|
||||
from sklearn.naive_bayes import BernoulliNB
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
|
||||
from sklearn.neural_network import MLPClassifier
|
||||
from xgboost import XGBClassifier
|
||||
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
|
||||
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.compose import make_column_transformer
|
||||
|
||||
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
|
||||
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
|
||||
from sklearn.metrics import make_scorer
|
||||
from sklearn.metrics import classification_report
|
||||
|
||||
|
||||
from sklearn.metrics import average_precision_score
|
||||
|
||||
from sklearn.model_selection import cross_validate
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.pipeline import make_pipeline
|
||||
|
||||
from sklearn.feature_selection import RFE
|
||||
from sklearn.feature_selection import RFECV
|
||||
|
||||
import itertools
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
print(np.__version__)
|
||||
print(pd.__version__)
|
||||
from statistics import mean, stdev, median, mode
|
||||
|
||||
#%%
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir(homedir + "/git/ML_AI_training/")
|
||||
|
||||
# my function
|
||||
from MultClassPipe import MultClassPipeline
|
||||
from MultClassPipe2 import MultClassPipeline2
|
||||
from loopity_loop import MultClassPipeSKF
|
||||
|
||||
gene = 'pncA'
|
||||
drug = 'pyrazinamide'
|
||||
|
||||
#==============
|
||||
# directories
|
||||
#==============
|
||||
datadir = homedir + '/git/Data/'
|
||||
indir = datadir + drug + '/input/'
|
||||
outdir = datadir + drug + '/output/'
|
||||
|
||||
#=======
|
||||
# input
|
||||
#=======
|
||||
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
|
||||
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
|
||||
|
||||
my_df = pd.read_csv(infile_ml1)
|
||||
my_df.dtypes
|
||||
my_df_cols = my_df.columns
|
||||
|
||||
geneL_basic = ['pnca']
|
||||
geneL_na = ['gid']
|
||||
geneL_na_ppi2 = ['rpob']
|
||||
geneL_ppi2 = ['alr', 'embb', 'katg']
|
||||
#%% get cols
|
||||
mycols = my_df.columns
|
||||
|
||||
my_df['active_aa_pos'].dtype
|
||||
my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
|
||||
|
||||
if gene.lower() in geneL_na_ppi2:
|
||||
#D1148 get rid of
|
||||
na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
|
||||
my_df = my_df.drop(index=na_index)
|
||||
|
||||
#%%============================================================================
|
||||
# GET Y
|
||||
|
||||
# Target1: mutation_info_labels
|
||||
dm_om_map = {'DM': 1, 'OM': 0}
|
||||
target1 = my_df['mutation_info_labels'].map(dm_om_map)
|
||||
target1.value_counts()
|
||||
|
||||
# Target2: drug
|
||||
drug_labels = drug + '_labels'
|
||||
drug_labels
|
||||
my_df[drug_labels] = my_df[drug].map({1: 'resistant', 0: 'sensitive'})
|
||||
my_df[drug_labels].value_counts()
|
||||
my_df[drug_labels] = my_df[drug_labels].fillna('unknown')
|
||||
my_df[drug_labels].value_counts()
|
||||
target2 = my_df[drug_labels]
|
||||
|
||||
# Target3: drtype [Binary]
|
||||
drtype_labels = 'drtype_labels'
|
||||
my_df[drtype_labels] = my_df['drtype'].map({'Sensitive' : 0
|
||||
, 'Other' : 0
|
||||
, 'Pre-MDR' : 1
|
||||
, 'MDR' : 1
|
||||
, 'Pre-XDR' : 1
|
||||
, 'XDR' : 1})
|
||||
# target3 = 'drtype' [Multinomial]
|
||||
target3 = my_df[drtype_labels]
|
||||
|
||||
# target4
|
||||
drtype_labels2 = 'drtype_labels2'
|
||||
my_df[drtype_labels2] = my_df['drtype'].map({'Sensitive' : 0
|
||||
, 'Other' : 0
|
||||
, 'Pre-MDR' : 1
|
||||
, 'MDR' : 1
|
||||
, 'Pre-XDR' : 2
|
||||
, 'XDR' : 2})
|
||||
target4 = my_df[drtype_labels2]
|
||||
|
||||
# sanity checks
|
||||
target1.value_counts()
|
||||
my_df['mutation_info_labels'].value_counts()
|
||||
|
||||
target2.value_counts()
|
||||
my_df[drug_labels].value_counts()
|
||||
|
||||
target3.value_counts()
|
||||
my_df['drtype'].value_counts()
|
||||
target4.value_counts()
|
||||
my_df['drtype'].value_counts()
|
||||
|
||||
#%%
|
||||
# GET X
|
||||
common_cols_stabiltyN = ['ligand_distance'
|
||||
, 'ligand_affinity_change'
|
||||
, 'duet_stability_change'
|
||||
, 'ddg_foldx'
|
||||
, 'deepddg'
|
||||
, 'ddg_dynamut2']
|
||||
|
||||
# Build stability columns ~ gene
|
||||
if gene.lower() in geneL_basic:
|
||||
x_stabilityN = common_cols_stabiltyN
|
||||
|
||||
if gene.lower() in geneL_ppi2:
|
||||
x_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity'
|
||||
, 'interface_dist']
|
||||
if gene.lower() in geneL_na:
|
||||
x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity']
|
||||
|
||||
if gene.lower() in geneL_na_ppi2:
|
||||
x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
|
||||
#D1148 get rid of
|
||||
#na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
|
||||
#my_df = my_df.drop(index=na_index)
|
||||
|
||||
X_strFN = ['asa'
|
||||
, 'rsa'
|
||||
, 'kd_values'
|
||||
, 'rd_values']
|
||||
|
||||
X_evolFN = ['consurf_score'
|
||||
, 'snap2_score'
|
||||
, 'snap2_accuracy_pc']
|
||||
|
||||
# TODO: ADD ED values
# Problematic due to NA: filling NA with 'unknown' or any string will make the column categorical
# OPTIONS (see the KNNImputer sketch below):
# 1. Imputing: KNN or MICE or from the distribution
# 2. Fill NA with the median or mode
# 3. Build a separate dataset without the genomic features at all for ML, then use it as a 'blind test set'
#    (this means the size of the training data gets reduced!)
# 4. Remove genomic features from ML COMPLETELY!
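# A minimal sketch of option 1 above (KNN imputation of the NA-heavy genomic columns),
# shown for illustration only -- these columns are not imputed in this script.
from sklearn.impute import KNNImputer

def impute_knn(df, cols, n_neighbors=5):
    """Return a copy of df with the listed numeric columns imputed by K-nearest neighbours."""
    out = df.copy()
    imputer = KNNImputer(n_neighbors=n_neighbors)
    out[cols] = imputer.fit_transform(out[cols])
    return out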
|
||||
|
||||
# X_genomicFN = ['af'
|
||||
# , 'or_mychisq'
|
||||
# , 'or_logistic'
|
||||
# , 'or_fisher'
|
||||
# , 'pval_fisher']
|
||||
|
||||
#%% try combinations
|
||||
X_vars1 = my_df[x_stabilityN]
|
||||
X_vars2 = my_df[X_strFN]
|
||||
X_vars3 = my_df[X_evolFN]
|
||||
|
||||
X_vars5 = my_df[x_stabilityN + X_strFN]
|
||||
X_vars6 = my_df[x_stabilityN + X_evolFN]
|
||||
#X_vars7 = my_df[x_stabilityN + X_genomicFN]
|
||||
X_vars8 = my_df[X_strFN + X_evolFN]
|
||||
#X_vars9 = my_df[X_strFN + X_genomicFN]
|
||||
#X_vars10 = my_df[X_evolFN + X_genomicFN]
|
||||
X_vars11 = my_df[x_stabilityN + X_strFN + X_evolFN]
|
||||
#X_vars12 = my_df[x_stabilityN + X_strFN + X_evolFN + X_genomicFN]
|
||||
|
||||
numerical_features_names = x_stabilityN + X_strFN + X_evolFN
|
||||
|
||||
# separate ones for foldx?
|
||||
categorical_features_names = ['ss_class'
|
||||
, 'wt_prop_water'
|
||||
# , 'lineage_labels' # misleading if using merged_df3
|
||||
, 'mut_prop_water'
|
||||
, 'wt_prop_polarity'
|
||||
, 'mut_prop_polarity'
|
||||
, 'wt_calcprop'
|
||||
, 'mut_calcprop'
|
||||
, 'active_aa_pos']
|
||||
|
||||
numerical_features_df = my_df[numerical_features_names]
|
||||
numerical_features_df.shape
|
||||
|
||||
categorical_features_df = my_df[categorical_features_names]
|
||||
categorical_features_df.shape
|
||||
|
||||
all_features_df = my_df[numerical_features_names + categorical_features_names]
|
||||
all_features_df.shape
|
82
earlier_versions/loopity_detangle.py
Normal file
|
@ -0,0 +1,82 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Thu Mar 10 18:06:34 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
#%%
import random
import pprint as pp
import pandas as pd
from statistics import mean
|
||||
models = [
|
||||
('Logistic Regression' , log_reg)
|
||||
, ('K-Nearest Neighbors', knn)
|
||||
]
|
||||
|
||||
classification_metrics = {
|
||||
'F1_score': []
|
||||
,'MCC': []
|
||||
,'Precision': []
|
||||
,'Recall': []
|
||||
,'Accuracy': []
|
||||
,'ROC_curve': []
|
||||
}
|
||||
|
||||
folds=[1,2]
|
||||
fold_no=1
|
||||
fold_dict={}
|
||||
for model_name, model in models:
|
||||
fold_dict.update({model_name: {}})
|
||||
|
||||
for f in folds:
|
||||
fold=("fold_"+str(fold_no))
|
||||
for model_name, model in models:
|
||||
print("start of model", model_name, "fold: ", fold)
|
||||
fold_dict[model_name].update({fold: {}})
|
||||
fold_dict[model_name][fold].update(classification_metrics)
|
||||
|
||||
print("end of model", model_name, "fold: ", fold)
|
||||
fold_dict[model_name][fold].update({'F1_score': random.randrange(1,10)})
|
||||
fold_no +=1
|
||||
pp.pprint(fold_dict)
|
||||
|
||||
|
||||
#%%
|
||||
folds_f1=[]
|
||||
|
||||
for model_name, model in models:
|
||||
print("Calculating mean for F1_score for: ", model_name)
|
||||
#for key in fold_dict['Logistic Regression']:
|
||||
# wrap this in a classification_metric for loop
|
||||
for key in fold_dict[model_name]:
|
||||
folds_f1.append(fold_dict['Logistic Regression'][key]['F1_score'])
|
||||
#folds_f1.append(folds_f1)
|
||||
print('key:', key, 'F1scores:', folds_f1)
|
||||
mean(folds_f1)
|
||||
#%%
|
||||
scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
|
||||
|
||||
# manually
|
||||
model_name = 'Logistic Regression'
|
||||
model_metric = 'F1_score'
|
||||
|
||||
log_reg_f1 = []
|
||||
for key in fold_dict[model_name]:
|
||||
log_reg_f1.append(fold_dict[model_name][key][model_metric])
|
||||
log_reg_f1M = mean(log_reg_f1)
|
||||
print('key:', key, model_metric, ':', log_reg_f1)
|
||||
print(log_reg_f1M)
|
||||
|
||||
log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
|
||||
log_reg_f1df
|
||||
|
||||
#%%
|
||||
model_metric = 'MCC'
|
||||
log_reg_mcc = []
|
||||
for key in fold_dict[model_name]:
|
||||
log_reg_mcc.append(fold_dict[model_name][key][model_metric])
|
||||
log_reg_mccM = mean(log_reg_mcc)
|
||||
print('key:', key, model_metric, ':', log_reg_mcc)
|
||||
print(log_reg_mccM)
|
||||
|
||||
log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
|
||||
log_reg_mccdf
|
84
earlier_versions/my_data5_results_pnca
Normal file
|
@ -0,0 +1,84 @@
|
|||
# stabilty [6]
|
||||
Model F1_Score Precision Recall Accuracy ROC_AUC
|
||||
0 Logistic Regression 0.738854 0.698795 0.783784 0.707143 0.702498
|
||||
1 Naive Bayes 0.627451 0.607595 0.648649 0.592857 0.589476
|
||||
2 K-Nearest Neighbors 0.731707 0.666667 0.810811 0.685714 0.678133
|
||||
3 SVM 0.729412 0.645833 0.837838 0.671429 0.661343
|
||||
4 MLP 0.670968 0.641975 0.702703 0.635714 0.631654
|
||||
5 Decision Tree 0.653595 0.632911 0.675676 0.621429 0.618141
|
||||
6 Extra Trees 0.733728 0.652632 0.837838 0.678571 0.668919
|
||||
7 Random Forest 0.726190 0.648936 0.824324 0.671429 0.662162
|
||||
8 XGBoost 0.704403 0.658824 0.756757 0.664286 0.658681)
|
||||
|
||||
# evolution [3]
|
||||
Model F1_Score Precision Recall Accuracy ROC_AUC
|
||||
0 Logistic Regression 0.795181 0.717391 0.891892 0.757143 0.748976
|
||||
1 Naive Bayes 0.805031 0.752941 0.864865 0.778571 0.773342
|
||||
2 K-Nearest Neighbors 0.735484 0.703704 0.770270 0.707143 0.703317
|
||||
3 SVM 0.797619 0.712766 0.905405 0.757143 0.748157
|
||||
4 MLP 0.787879 0.714286 0.878378 0.750000 0.742219
|
||||
5 Decision Tree 0.631579 0.615385 0.648649 0.600000 0.597052
|
||||
6 Extra Trees 0.688312 0.662500 0.716216 0.657143 0.653563
|
||||
7 Random Forest 0.704403 0.658824 0.756757 0.664286 0.658681
|
||||
8 XGBoost 0.713376 0.674699 0.756757 0.678571 0.673833)
|
||||
|
||||
# str features [4]
|
||||
Model F1_Score Precision Recall Accuracy ROC_AUC
|
||||
0 Logistic Regression 0.729412 0.645833 0.837838 0.671429 0.661343
|
||||
1 Naive Bayes 0.723926 0.662921 0.797297 0.678571 0.671376
|
||||
2 K-Nearest Neighbors 0.662338 0.637500 0.689189 0.628571 0.624898
|
||||
3 SVM 0.727273 0.627451 0.864865 0.657143 0.644554
|
||||
4 MLP 0.710843 0.641304 0.797297 0.657143 0.648649
|
||||
5 Decision Tree 0.561151 0.600000 0.527027 0.564286 0.566544
|
||||
6 Extra Trees 0.567376 0.597015 0.540541 0.564286 0.565725
|
||||
7 Random Forest 0.596026 0.584416 0.608108 0.564286 0.561630
|
||||
8 XGBoost 0.630872 0.626667 0.635135 0.607143 0.605446)
|
||||
|
||||
#=========================================================================
|
||||
# stability + evolution + str features [13 = 6+3+4]
|
||||
Model F1_Score Precision Recall Accuracy ROC_AUC
|
||||
0 Logistic Regression 0.726115 0.686747 0.770270 0.692857 0.688165
|
||||
1 Naive Bayes 0.730769 0.695122 0.770270 0.700000 0.695741
|
||||
2 K-Nearest Neighbors 0.742515 0.666667 0.837838 0.692857 0.684070
|
||||
3 SVM 0.763636 0.692308 0.851351 0.721429 0.713554
|
||||
4 MLP 0.717949 0.682927 0.756757 0.685714 0.681409
|
||||
5 Decision Tree 0.671429 0.712121 0.635135 0.671429 0.673628
|
||||
6 Extra Trees 0.756410 0.719512 0.797297 0.728571 0.724406
|
||||
7 Random Forest 0.742138 0.694118 0.797297 0.707143 0.701679
|
||||
8 XGBoost 0.692810 0.670886 0.716216 0.664286 0.661138)
|
||||
|
||||
# stability + evolution [9=6+3]
|
||||
Model F1_Score Precision Recall Accuracy ROC_AUC
|
||||
0 Logistic Regression 0.729560 0.682353 0.783784 0.692857 0.687346
|
||||
1 Naive Bayes 0.743590 0.707317 0.783784 0.714286 0.710074
|
||||
2 K-Nearest Neighbors 0.720497 0.666667 0.783784 0.678571 0.672195
|
||||
3 SVM 0.771084 0.695652 0.864865 0.728571 0.720311
|
||||
4 MLP 0.679739 0.658228 0.702703 0.650000 0.646806
|
||||
5 Decision Tree 0.620690 0.633803 0.608108 0.607143 0.607084
|
||||
6 Extra Trees 0.727273 0.700000 0.756757 0.700000 0.696560
|
||||
7 Random Forest 0.734177 0.690476 0.783784 0.700000 0.694922
|
||||
8 XGBoost 0.675497 0.662338 0.689189 0.650000 0.647625)
|
||||
|
||||
# stability + str features [10=6+4]
|
||||
Model F1_Score Precision Recall Accuracy ROC_AUC
|
||||
0 Logistic Regression 0.750000 0.697674 0.810811 0.714286 0.708436
|
||||
1 Naive Bayes 0.714286 0.687500 0.743243 0.685714 0.682228
|
||||
2 K-Nearest Neighbors 0.687500 0.639535 0.743243 0.642857 0.636773
|
||||
3 SVM 0.743902 0.677778 0.824324 0.700000 0.692465
|
||||
4 MLP 0.716981 0.670588 0.770270 0.678571 0.673014
|
||||
5 Decision Tree 0.616438 0.625000 0.608108 0.600000 0.599509
|
||||
6 Extra Trees 0.697368 0.679487 0.716216 0.671429 0.668714
|
||||
7 Random Forest 0.684211 0.666667 0.702703 0.657143 0.654382
|
||||
8 XGBoost 0.666667 0.645570 0.689189 0.635714 0.632473)
|
||||
|
||||
# evolution + str features[7=3+4]
|
||||
Model F1_Score Precision Recall Accuracy ROC_AUC
|
||||
0 Logistic Regression 0.773006 0.707865 0.851351 0.735714 0.728706
|
||||
1 Naive Bayes 0.750000 0.730769 0.770270 0.728571 0.726044
|
||||
2 K-Nearest Neighbors 0.737500 0.686047 0.797297 0.700000 0.694103
|
||||
3 SVM 0.763636 0.692308 0.851351 0.721429 0.713554
|
||||
4 MLP 0.775758 0.703297 0.864865 0.735714 0.727887
|
||||
5 Decision Tree 0.675497 0.662338 0.689189 0.650000 0.647625
|
||||
6 Extra Trees 0.715232 0.701299 0.729730 0.692857 0.690622
|
||||
7 Random Forest 0.715232 0.701299 0.729730 0.692857 0.690622
|
||||
8 XGBoost 0.721519 0.678571 0.770270 0.685714 0.680590)
|
156
earlier_versions/my_data_modelpipe.py
Normal file
|
@ -0,0 +1,156 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Thu Mar 3 17:08:18 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.naive_bayes import BernoulliNB
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
|
||||
from sklearn.neural_network import MLPClassifier
|
||||
from sklearn.pipeline import Pipeline
|
||||
from xgboost import XGBClassifier
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.model_selection import train_test_split
|
||||
import os
|
||||
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
|
||||
import pandas as pd
|
||||
#%%
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir(homedir + "/git/ML_AI_training/test_data")
|
||||
|
||||
# this needs to be merged_df2 or merged_df3?
|
||||
#gene 'pncA'
|
||||
drug = 'pyrazinamide'
|
||||
|
||||
my_df = pd.read_csv("pnca_merged_df3.csv")
|
||||
|
||||
my_df.dtypes
|
||||
my_df_cols = my_df.columns
|
||||
|
||||
#%%============================================================================
|
||||
# GET Y
|
||||
# Y = my_df.loc[:,drug] #has NA
|
||||
dm_om_map = {'DM': 1, 'OM': 0}
|
||||
my_df['resistance'] = my_df['mutation_info_labels'].map(dm_om_map)
|
||||
|
||||
# sanity check
|
||||
my_df['resistance'].value_counts()
|
||||
my_df['mutation_info_labels'].value_counts()
|
||||
Y = my_df['resistance']
|
||||
|
||||
# GET X
|
||||
cols = my_df.columns
|
||||
X_stability = my_df[['ligand_distance'
|
||||
, 'ligand_affinity_change'
|
||||
, 'duet_stability_change'
|
||||
, 'ddg_foldx'
|
||||
, 'deepddg'
|
||||
, 'ddg_dynamut2']]
|
||||
|
||||
X_evol = my_df[['consurf_score'
|
||||
, 'snap2_score'
|
||||
, 'snap2_accuracy_pc']]
|
||||
|
||||
X_str = my_df[['asa'
|
||||
, 'rsa'
|
||||
, 'kd_values'
|
||||
, 'rd_values']]
|
||||
|
||||
#%% try combinations
|
||||
X_vars = X_stability
|
||||
X_vars = X_evol
|
||||
X_vars = X_str
|
||||
|
||||
X_vars = pd.concat([X_stability, X_evol, X_str], axis = 1)
|
||||
X_vars = pd.concat([X_stability, X_evol], axis = 1)
|
||||
X_vars = pd.concat([X_stability, X_str], axis = 1)
|
||||
X_vars = pd.concat([X_evol, X_str], axis = 1)
|
||||
|
||||
#%%
|
||||
X_vars.shape[1]
|
||||
|
||||
# TODO: stratified cross validate
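# A minimal sketch of the stratified cross-validation flagged in the TODO above,
# using X_vars and Y as defined earlier in this script. This is an illustration,
# not the train/test-split pipeline used below.
from sklearn.model_selection import StratifiedKFold, cross_validate

skf_demo = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
pipe_demo = Pipeline(steps=[('scaler', StandardScaler()),
                            ('classifier', LogisticRegression(random_state=42))])
cv_out = cross_validate(pipe_demo, X_vars, Y,
                        cv=skf_demo,
                        scoring=['accuracy', 'f1', 'recall', 'roc_auc'])
print(pd.DataFrame(cv_out).mean())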
|
||||
# Train-test Split
|
||||
rs = {'random_state': 42}
|
||||
X_train, X_test, y_train, y_test = train_test_split(X_vars,
|
||||
Y,
|
||||
test_size = 0.33,
|
||||
random_state = 42)
|
||||
|
||||
# Classification - Model Pipeline
|
||||
def modelPipeline(X_train, X_test, y_train, y_test):
|
||||
|
||||
log_reg = LogisticRegression(**rs)
|
||||
nb = BernoulliNB()
|
||||
knn = KNeighborsClassifier()
|
||||
svm = SVC(**rs)
|
||||
mlp = MLPClassifier(max_iter=500, **rs)
|
||||
dt = DecisionTreeClassifier(**rs)
|
||||
et = ExtraTreesClassifier(**rs)
|
||||
rf = RandomForestClassifier(**rs)
|
||||
xgb = XGBClassifier(**rs, verbosity=0)
|
||||
|
||||
clfs = [
|
||||
('Logistic Regression', log_reg),
|
||||
('Naive Bayes', nb),
|
||||
('K-Nearest Neighbors', knn),
|
||||
('SVM', svm),
|
||||
('MLP', mlp),
|
||||
('Decision Tree', dt),
|
||||
('Extra Trees', et),
|
||||
('Random Forest', rf),
|
||||
('XGBoost', xgb)
|
||||
]
|
||||
|
||||
|
||||
pipelines = []
|
||||
|
||||
scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
|
||||
|
||||
|
||||
for clf_name, clf in clfs:
|
||||
|
||||
pipeline = Pipeline(steps=[
|
||||
('scaler', StandardScaler()),
|
||||
('classifier', clf)
|
||||
]
|
||||
)
|
||||
pipeline.fit(X_train, y_train)
|
||||
|
||||
# Model predictions
|
||||
y_pred = pipeline.predict(X_test)
|
||||
|
||||
# F1-Score
|
||||
fscore = f1_score(y_test, y_pred)
|
||||
# Precision
|
||||
pres = precision_score(y_test, y_pred)
|
||||
# Recall
|
||||
rcall = recall_score(y_test, y_pred)
|
||||
# Accuracy
|
||||
accu = accuracy_score(y_test, y_pred)
|
||||
# ROC_AUC
|
||||
roc_auc = roc_auc_score(y_test, y_pred)
|
||||
|
||||
|
||||
pipelines.append(pipeline)
|
||||
|
||||
scores_df = scores_df.append({
|
||||
'Model' : clf_name,
|
||||
'F1_Score' : fscore,
|
||||
'Precision' : pres,
|
||||
'Recall' : rcall,
|
||||
'Accuracy' : accu,
|
||||
'ROC_AUC' : roc_auc
|
||||
|
||||
},
|
||||
ignore_index = True)
|
||||
|
||||
return pipelines, scores_df
|
||||
|
||||
|
||||
modelPipeline(X_train, X_test, y_train, y_test)
|
81
earlier_versions/my_data_target_counts.py
Normal file
|
@ -0,0 +1,81 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Thu Mar 3 17:08:18 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%% load packages
|
||||
import sys, os
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
import numpy as np
|
||||
import argparse
|
||||
from functools import reduce
|
||||
#%%
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir(homedir + "/git/ML_AI_training/test_data")
|
||||
|
||||
# gene and drug must be set before running; values match the sibling scripts
gene = 'pncA'
drug = 'pyrazinamide'
|
||||
|
||||
#==============
|
||||
# directories
|
||||
#==============
|
||||
datadir = homedir + '/git/Data/'
|
||||
indir = datadir + drug + '/input/'
|
||||
outdir = datadir + drug + '/output/'
|
||||
|
||||
# gene_baiscL = ['pnca']
|
||||
# geneL_naL = ['gid', 'rpob']
|
||||
# geneL_ppi2L = ['alr', 'embb', 'katg', 'rpob']
|
||||
|
||||
#=======
|
||||
# input
|
||||
#=======
|
||||
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
|
||||
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
|
||||
|
||||
my_df = pd.read_csv(infile_ml1)
|
||||
|
||||
my_df.dtypes
|
||||
my_df_cols = my_df.columns
|
||||
|
||||
#%%============================================================================
|
||||
# GET Y
|
||||
drug_labels = drug + '_labels'
|
||||
drug_labels
|
||||
my_df[drug_labels] = my_df[drug]
|
||||
my_df[drug_labels].value_counts()
|
||||
my_df[drug_labels] = my_df[drug].map({1: 'resistant', 0: 'sensitive'})
|
||||
my_df[drug_labels].value_counts()
|
||||
my_df[drug_labels] = my_df[drug_labels].fillna('unknown')
|
||||
my_df[drug_labels].value_counts()
|
||||
|
||||
mutC = my_df[[ 'mutationinformation']].count()
|
||||
|
||||
target1C = my_df['mutation_info_labels'].value_counts()
|
||||
|
||||
target2C = my_df[drug_labels].value_counts()
|
||||
#target2C.index = target2C.index.to_series().map({1: 'resistant', 0: 'sensitive'})
|
||||
|
||||
target3C = my_df['drtype'].value_counts()
|
||||
|
||||
targetsC = pd.concat([mutC, target1C, target2C, target3C])
|
||||
targetsC
|
||||
|
||||
# targetsC2 = pd.concat([mutC, target1C, target2C
|
||||
# #, target3C
|
||||
# ], axis = 1)
|
||||
# targetsC2
|
||||
|
||||
#%% try combinations
|
||||
# X_vars = X_stability
|
||||
# X_vars = X_evol
|
||||
# X_vars = X_str
|
||||
|
||||
# X_vars = pd.concat([X_stability, X_evol, X_str], axis = 1)
|
||||
# X_vars = pd.concat([X_stability, X_evol], axis = 1)
|
||||
# X_vars = pd.concat([X_stability, X_str], axis = 1)
|
||||
# X_vars = pd.concat([X_evol, X_str], axis = 1)
|
||||
|
212
earlier_versions/my_datap1.py
Normal file
|
@ -0,0 +1,212 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Thu Feb 24 10:48:10 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
###############################################################################
|
||||
# questions:
|
||||
# which data to use: merged_df3 or merged_df2
|
||||
# which is the target? or_mychisq or drtype col
|
||||
# scaling: can it be from -1 to 1?
|
||||
|
||||
# strategy:
|
||||
# available data = X_train
|
||||
# available data but NAN = validation_test
|
||||
# test data: mut generated not in mcsm
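# A minimal sketch of the split strategy described above, assuming a dataframe with an
# 'or_mychisq'-style label column like the one loaded below: rows with a value form the
# labelled training pool, rows with NaN form the blind validation set.
import pandas as pd

def split_by_label_availability(df, label_col='or_mychisq'):
    """Return (labelled_df, unlabelled_df) based on NaN in label_col."""
    labelled = df[df[label_col].notna()]
    unlabelled = df[df[label_col].isna()]
    return labelled, unlabelled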
|
||||
|
||||
###############################################################################
|
||||
import os, sys
|
||||
import re
|
||||
from sklearn.datasets import load_boston
|
||||
from sklearn import linear_model
|
||||
from sklearn import preprocessing
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
print(np.__version__)
|
||||
print(pd.__version__)
|
||||
#%% read data
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir(homedir + "/git/ML_AI_training/test_data")
|
||||
|
||||
# this needs to be merged_df2 or merged_df3?
|
||||
my_df = pd.read_csv("pnca_all_params.csv")
|
||||
|
||||
my_df.dtypes
|
||||
my_df_cols = my_df.columns
|
||||
|
||||
omit_cols1 = ['pdb_file'
|
||||
, 'seq_offset4pdb'
|
||||
, 'mut_3upper'
|
||||
, 'wild_pos'
|
||||
, 'wild_chain_pos'
|
||||
, 'chain'
|
||||
, 'wt_3upper'
|
||||
, 'consurf_colour'
|
||||
, 'consurf_colour_rev'
|
||||
, 'consurf_msa_data'
|
||||
, 'consurf_aa_variety'
|
||||
, 'snap2_accuracy_pc'
|
||||
, 'beta_logistic'
|
||||
, 'se_logistic'
|
||||
, 'zval_logisitc'
|
||||
, 'pval_chisq'
|
||||
, 'log10_or_mychisq'
|
||||
, 'neglog_pval_fisher'
|
||||
, 'wild_type'
|
||||
, 'mutant_type'
|
||||
, 'position'
|
||||
, 'ligand_id'
|
||||
, 'mutation'
|
||||
, 'ss'
|
||||
, 'ss_class' # include it later?
|
||||
]
|
||||
|
||||
omit_cols2 = list(my_df.columns[my_df.columns.str.contains(".*ci_.*") | my_df.columns.str.contains(".*_scaled*") | my_df.columns.str.contains(".*_outcome*")])
|
||||
|
||||
# [WATCH:] just to test since these have negative values!
|
||||
omit_cols3 = list(my_df.columns[my_df.columns.str.contains("electro_.*") | my_df.columns.str.contains("disulfide_.*") | my_df.columns.str.contains("hbonds_.*") | my_df.columns.str.contains("partcov_.*") | my_df.columns.str.contains("vdwclashes.*") | my_df.columns.str.contains("volumetric.*")])
|
||||
|
||||
omit_cols = omit_cols1 + omit_cols2 + omit_cols3
|
||||
|
||||
my_df_filt = my_df.loc[:, ~my_df.columns.isin(omit_cols)]
|
||||
my_df_filt_cols = my_df_filt.columns
|
||||
|
||||
foo = my_df_filt['or_mychisq'].value_counts()
|
||||
foo = foo.to_frame()
|
||||
########################
|
||||
# [WATCH]: Drop na
|
||||
my_df2 = my_df_filt.dropna()
|
||||
my_df2['resistance'] = my_df2['or_mychisq'].apply(lambda x: 0 if x <=1 else 1)
|
||||
my_df2['resistance'].value_counts()
|
||||
y = my_df2['resistance']
|
||||
|
||||
#==============================================================================
|
||||
omit_cols_y = ['or_mychisq', 'resistance']
|
||||
my_df_ml = my_df2.loc[:, ~my_df2.columns.isin(omit_cols_y)]
|
||||
#%%############################################################################
|
||||
X_train = my_df_ml.set_index('mutationinformation')
|
||||
X_train = X_train.iloc[:,:4]
|
||||
y_train = y
|
||||
#X_train = X_train.dropna()
|
||||
#y_train = y.dropna()
|
||||
|
||||
# check dim
|
||||
X_train.shape
|
||||
y_train.shape
|
||||
|
||||
#%%=====================================================
|
||||
grid = sns.PairGrid(data = pd.concat([X_train
|
||||
, pd.Series(y_train , name = "resistance")]
|
||||
, axis = 1))
|
||||
|
||||
grid.map_offdiag(sns.scatterplot)
|
||||
grid.map_diag(sns.distplot)
|
||||
plt.show()
|
||||
|
||||
model = linear_model.LinearRegression()
|
||||
model.fit(X_train, y_train)
|
||||
|
||||
###################
|
||||
# test set
|
||||
X_test = my_df[my_df['or_mychisq'].isnull()]
|
||||
#X_test =[ X_test.iloc[:,:4]]
|
||||
# HARD part?
|
||||
# what should be the test set?
|
||||
X_test = [23.9, 0.69, -0.16, 0.59
|
||||
, 5, 0.5, 0.4, -1
|
||||
, 0.1, 1, 1, 1]
|
||||
X_test_re = np.array(X_test).reshape(3, -1)
|
||||
|
||||
|
||||
####################
|
||||
fitted = model.predict(X_train)
|
||||
model.coef_
|
||||
model.predict(X_test_re)
|
||||
resid = y_train - fitted
|
||||
resid
|
||||
|
||||
#####################
|
||||
from sklearn import preprocessing
|
||||
scaler = preprocessing.MinMaxScaler()
|
||||
scaler.fit(X_train)
|
||||
#We can then create a scaled training set
|
||||
X_train_scaled = scaler.transform(X_train)
|
||||
new_scaled = scaler.transform(X_test_re)
|
||||
model.predict(new_scaled)
|
||||
#########
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
# model_pipe = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
|
||||
# ,('regression', linear_model.LinearRegression())
|
||||
# ])
|
||||
model_pipe = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
|
||||
,('logis', LogisticRegression(class_weight = 'balanced'))
|
||||
])
|
||||
|
||||
model_pipe.fit(X_train,y_train)
|
||||
fitted_vals = model_pipe.predict(X_train)
|
||||
# gives the array of predictions
|
||||
model_pipe.predict(X_test_re)
|
||||
|
||||
# for Linear Reg only
|
||||
# resid = y_train - fitted_vals
|
||||
# resid
|
||||
|
||||
#=====
|
||||
# Logistic 1 test
# FAILS because the test set must have the same number of features as the training set:
# if the model was trained on 10 features, predict() needs 10 features per sample.
# THINK!!!!
#=====
|
||||
mod_logis = linear_model.LogisticRegression(class_weight = 'balanced')
|
||||
mod_logis.fit(X_train,y_train)
|
||||
X_test = [23.9]
|
||||
X_test_re = np.array(X_test).reshape(1, -1)
|
||||
mod_logis.predict(X_test_re)
|
||||
#################
|
||||
|
||||
from sklearn.metrics import accuracy_score, precision_score, recall_score
|
||||
y_pred = model_pipe.predict(X_train)
|
||||
accuracy_score(y_train,y_pred)
|
||||
precision_score(y_train,y_pred,pos_label=1)# tp/(tp + fp)
|
||||
recall_score(y_train,y_pred,pos_label=1) # tp/(tp + fn)
|
||||
########
|
||||
|
||||
# WORKS!
|
||||
from sklearn.model_selection import cross_validate
|
||||
from sklearn.metrics import make_scorer
|
||||
import pandas as pd
|
||||
acc = make_scorer(accuracy_score)
|
||||
def precision(y_true,y_pred):
|
||||
return precision_score(y_true,y_pred,pos_label = 1) #0
|
||||
|
||||
def recall(y_true,y_pred):
|
||||
return recall_score(y_true, y_pred, pos_label = 1) #0
|
||||
|
||||
prec = make_scorer(precision)
|
||||
rec = make_scorer(recall)
|
||||
output = cross_validate(model_pipe
|
||||
, X_train
|
||||
, y_train
|
||||
, scoring = {'acc' : acc
|
||||
,'prec' : prec
|
||||
,'rec' : rec}
|
||||
, cv = 10, return_train_score = False)
|
||||
|
||||
pd.DataFrame(output).mean()
|
||||
# fit_time 0.005486
|
||||
# score_time 0.002673
|
||||
# test_acc 0.601799
|
||||
# test_prec 0.976936
|
||||
# test_rec 0.603226
|
||||
# dtype: float64
|
||||
|
||||
# the three scores
|
||||
# 0.65527950310559
|
||||
# 0.9853658536585366
|
||||
# 0.6516129032258065
|
272
earlier_versions/my_datap10.py
Normal file
|
@ -0,0 +1,272 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat Mar 5 12:57:32 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%%
|
||||
# Data, etc for now comes from my_data6.py and/or my_data5.py
|
||||
#%% Specify dir and import functions
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir(homedir + "/git/ML_AI_training/")
|
||||
#%% Try combinations
|
||||
#import sys, os
|
||||
#os.system("imports.py")
|
||||
def precision(y_true,y_pred):
|
||||
return precision_score(y_true,y_pred,pos_label = 1)
|
||||
def recall(y_true,y_pred):
|
||||
return recall_score(y_true, y_pred, pos_label = 1)
|
||||
def f1(y_true,y_pred):
|
||||
return f1_score(y_true, y_pred, pos_label = 1)
|
||||
|
||||
#%% Check df features
|
||||
numerical_features_df.shape
|
||||
categorical_features_df.shape
|
||||
all_features_df.shape
|
||||
all_features_df.dtypes
|
||||
#%% Simple train and test data splits
|
||||
target = target1
|
||||
#target = target3
|
||||
X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df,
|
||||
target,
|
||||
test_size = 0.33,
|
||||
random_state = 42)
|
||||
|
||||
X_trainC, X_testC, y_trainC, y_testC = train_test_split(categorical_features_df,
|
||||
target,
|
||||
test_size = 0.33,
|
||||
random_state = 42)
|
||||
|
||||
X_train, X_test, y_train, y_test = train_test_split(all_features_df,
|
||||
target,
|
||||
test_size = 0.33,
|
||||
random_state = 42)
|
||||
#%% Stratified K-fold: Single model
|
||||
|
||||
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
|
||||
, ('log_reg', LogisticRegression(class_weight = 'balanced')) ])
|
||||
model1
|
||||
rs = {'random_state': 42}
|
||||
log_reg = LogisticRegression(**rs)
|
||||
nb = BernoulliNB()
|
||||
clfs = [('Logistic Regression', log_reg)
|
||||
,('Naive Bayes', nb)]
|
||||
|
||||
seed_skf = 42
|
||||
skf = StratifiedKFold(n_splits = 10
|
||||
, shuffle = True
|
||||
, random_state = seed_skf)
|
||||
|
||||
X_array = np.array(numerical_features_df)
|
||||
Y = target1
|
||||
|
||||
model_scores_df = pd.DataFrame()
|
||||
fscoreL = []
|
||||
mccL = []
|
||||
presL = []
|
||||
recallL = []
|
||||
accuL = []
|
||||
roc_aucL = []
|
||||
|
||||
for train_index, test_index in skf.split(X_array, Y):
|
||||
x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
|
||||
y_train_fold, y_test_fold = Y[train_index], Y[test_index]
|
||||
|
||||
model1.fit(x_train_fold, y_train_fold)
|
||||
y_pred_fold = model1.predict(x_test_fold)
|
||||
|
||||
#----------------
|
||||
# Model metrics
|
||||
#----------------
|
||||
# F1-Score
|
||||
fscore = f1_score(y_test_fold, y_pred_fold)
|
||||
fscoreL.append(fscore)
|
||||
fscoreM = mean(fscoreL)
|
||||
|
||||
# Matthews correlation coefficient
|
||||
mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
|
||||
mccL.append(mcc)
|
||||
mccM = mean(mccL)
|
||||
|
||||
# Precision
|
||||
pres = precision_score(y_test_fold, y_pred_fold)
|
||||
presL.append(pres)
|
||||
presM = mean(presL)
|
||||
|
||||
# Recall
|
||||
recall = recall_score(y_test_fold, y_pred_fold)
|
||||
recallL.append(recall)
|
||||
recallM = mean(recallL)
|
||||
|
||||
# Accuracy
|
||||
accu = accuracy_score(y_test_fold, y_pred_fold)
|
||||
accuL.append(accu)
|
||||
accuM = mean(accuL)
|
||||
|
||||
# ROC_AUC
|
||||
roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
|
||||
roc_aucL.append(roc_auc)
|
||||
roc_aucM = mean(roc_aucL)
|
||||
|
||||
model_scores_df = model_scores_df.append({'Model' : model1.steps[1][0]
|
||||
,'F1_score' : fscoreM
|
||||
, 'MCC' : mccM
|
||||
, 'Precision': presM
|
||||
, 'Recall' : recallM
|
||||
, 'Accuracy' : accuM
|
||||
, 'ROC_curve': roc_aucM}
|
||||
, ignore_index = True)
|
||||
print('\nModel metrics:', model_scores_df)
|
||||
#%% stratified KFold: Multiple_models:
|
||||
input_df = numerical_features_df
|
||||
#X_array = np.array(input_df)
|
||||
Y = target1
|
||||
var_type = 'numerical'
|
||||
|
||||
input_df = all_features_df
|
||||
#X_array = np.array(input_df)
|
||||
Y = target1
|
||||
var_type = 'mixed'
|
||||
|
||||
input_df = categorical_features_df
|
||||
#X_array = np.array(input_df)
|
||||
Y = target1
|
||||
var_type = 'categorical'
|
||||
|
||||
#=================
|
||||
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
|
||||
numerical_ix
|
||||
|
||||
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
|
||||
categorical_ix
|
||||
|
||||
# Determine preprocessing steps ~ var_type
|
||||
if var_type == 'numerical':
|
||||
t = [('num', MinMaxScaler(), numerical_ix)]
|
||||
|
||||
if var_type == 'categorical':
|
||||
t = [('cat', OneHotEncoder(), categorical_ix)]
|
||||
|
||||
if var_type == 'mixed':
|
||||
t = [('cat', OneHotEncoder(), categorical_ix)
|
||||
, ('num', MinMaxScaler(), numerical_ix)]
|
||||
|
||||
##############################
col_transform = ColumnTransformer(transformers = t
                                  , remainder = 'passthrough')


rs = {'random_state': 42}

#log_reg = LogisticRegression(**rs)
log_reg = LogisticRegression(class_weight = 'balanced')
nb = BernoulliNB()
rf = RandomForestClassifier(**rs)

clfs = [('Logistic Regression', log_reg)
        #,('Naive Bayes', nb)
        #, ('Random Forest' , rf)
        ]

#seed_skf = 42
skf = StratifiedKFold(n_splits = 10
                      , shuffle = True
                      #, random_state = seed_skf
                      , **rs)
#scores_df = pd.DataFrame()
fscoreL  = []
mccL     = []
presL    = []
recallL  = []
accuL    = []
roc_aucL = []
for train_index, test_index in skf.split(input_df, Y):
    print('\nSKF train index:', train_index
          , '\nSKF test index:', test_index)
    x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
    y_train_fold, y_test_fold = Y.iloc[train_index], Y.iloc[test_index]
    # for train_index, test_index in skf.split(X_array, Y):
    #     print('\nSKF train index:', train_index
    #           , '\nSKF test index:', test_index)
    #     x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
    #     y_train_fold, y_test_fold = Y[train_index], Y[test_index]

    clf_scores_df = pd.DataFrame()
    for clf_name, clf in clfs:
        # model2 = Pipeline(steps=[('preprocess', MinMaxScaler())
        #                          , ('classifier', clf)])
        model2 = Pipeline(steps=[('preprocess', col_transform)
                                 , ('classifier', clf)])

        model2.fit(x_train_fold, y_train_fold)
        y_pred_fold = model2.predict(x_test_fold)

        #----------------
        # Model metrics
        #----------------
        # F1-Score
        fscore = f1_score(y_test_fold, y_pred_fold)
        fscoreL.append(fscore)
        fscoreM = mean(fscoreL)

        # Matthews correlation coefficient
        mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
        mccL.append(mcc)
        mccM = mean(mccL)

        # Precision
        pres = precision_score(y_test_fold, y_pred_fold)
        presL.append(pres)
        presM = mean(presL)

        # Recall
        recall = recall_score(y_test_fold, y_pred_fold)
        recallL.append(recall)
        recallM = mean(recallL)

        # Accuracy
        accu = accuracy_score(y_test_fold, y_pred_fold)
        accuL.append(accu)
        accuM = mean(accuL)

        # ROC_AUC
        roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
        roc_aucL.append(roc_auc)
        roc_aucM = mean(roc_aucL)

        clf_scores_df = clf_scores_df.append({'Model'    : clf_name
                                              ,'F1_score' : fscoreM
                                              , 'MCC'      : mccM
                                              , 'Precision': presM
                                              , 'Recall'   : recallM
                                              , 'Accuracy' : accuM
                                              , 'ROC_curve': roc_aucM}
                                             , ignore_index = True)
    #scores_df = scores_df.append(clf_scores_df)
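# The calls in the next cell use a MultClassPipeSKF() helper that is defined elsewhere
# in this repo; the sketch below is only a guess at its shape (signature inferred from
# those calls), wrapping the fold/classifier loop above. It reuses rs and clfs from this
# module's scope and is not the repo's actual implementation.
def MultClassPipeSKF_sketch(input_df, y_targetF, var_type = 'numerical', skf_splits = 10):
    numerical_ix   = input_df.select_dtypes(include = ['int64', 'float64']).columns
    categorical_ix = input_df.select_dtypes(include = ['object', 'bool']).columns
    if var_type == 'numerical':
        t = [('num', MinMaxScaler(), numerical_ix)]
    elif var_type == 'categorical':
        t = [('cat', OneHotEncoder(), categorical_ix)]
    else:  # 'mixed'
        t = [('cat', OneHotEncoder(), categorical_ix)
             , ('num', MinMaxScaler(), numerical_ix)]
    col_transform = ColumnTransformer(transformers = t, remainder = 'passthrough')
    skf = StratifiedKFold(n_splits = skf_splits, shuffle = True, **rs)

    rows = []
    for train_index, test_index in skf.split(input_df, y_targetF):
        x_tr, x_te = input_df.iloc[train_index], input_df.iloc[test_index]
        y_tr, y_te = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
        for clf_name, clf in clfs:
            model = Pipeline(steps = [('preprocess', col_transform)
                                      , ('classifier', clf)])
            model.fit(x_tr, y_tr)
            y_pr = model.predict(x_te)
            rows.append({'Model'     : clf_name
                         , 'F1_score' : f1_score(y_te, y_pr)
                         , 'MCC'      : matthews_corrcoef(y_te, y_pr)
                         , 'Precision': precision_score(y_te, y_pr)
                         , 'Recall'   : recall_score(y_te, y_pr)
                         , 'Accuracy' : accuracy_score(y_te, y_pr)
                         , 'ROC_curve': roc_auc_score(y_te, y_pr)})
    # one row per fold x classifier; average per classifier for a single summary line
    return pd.DataFrame(rows).groupby('Model').mean().reset_index()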
#%% Call functions

tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN)
tN_res

t2_res = MultClassPipeline2(X_train, X_test, y_train, y_test, input_df = all_features_df)
t2_res

#CHECK: numbers are awfully close to each other!

t3_res = MultClassPipeSKF(input_df = numerical_features_df
                          , y_targetF = target1
                          , var_type = 'numerical'
                          , skf_splits = 10)
t3_res

#CHECK: numbers are awfully close to each other!
t4_res = MultClassPipeSKF(input_df = all_features_df
                          , y_targetF = target1
                          , var_type = 'mixed'
                          , skf_splits = 10)
t4_res
195 earlier_versions/my_datap11.py Normal file
@@ -0,0 +1,195 @@
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat Mar 5 12:57:32 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%%
|
||||
# Data, etc for now comes from my_data6.py and/or my_data5.py
|
||||
#%% Specify dir and import functions
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir(homedir + "/git/ML_AI_training/")
|
||||
#%% Try combinations
|
||||
#import sys, os
|
||||
#os.system("imports.py")
|
||||
def precision(y_true,y_pred):
    return precision_score(y_true,y_pred,pos_label = 1)
def recall(y_true,y_pred):
    return recall_score(y_true, y_pred, pos_label = 1)
def f1(y_true,y_pred):
    return f1_score(y_true, y_pred, pos_label = 1)
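# A minimal sketch of turning these helpers into scorers for cross_validate()/GridSearchCV
# (the make_scorer import and the *_scorer names are added here and are not part of the
#  original script, which pulls its other imports from my_data5.py/my_data6.py):
from sklearn.metrics import make_scorer
prec_scorer = make_scorer(precision)
rec_scorer  = make_scorer(recall)
f1_scorer   = make_scorer(f1)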
#%% Check df features
|
||||
numerical_features_df.shape
|
||||
categorical_features_df.shape
|
||||
all_features_df.shape
|
||||
all_features_df.dtypes
|
||||
#%% Simple train and test data splits
|
||||
target = target1
|
||||
#target = target3
|
||||
X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df,
|
||||
target,
|
||||
test_size = 0.33,
|
||||
random_state = 42)
|
||||
|
||||
X_trainC, X_testC, y_trainC, y_testC = train_test_split(categorical_features_df,
|
||||
target,
|
||||
test_size = 0.33,
|
||||
random_state = 42)
|
||||
|
||||
X_train, X_test, y_train, y_test = train_test_split(all_features_df,
|
||||
target,
|
||||
test_size = 0.33,
|
||||
random_state = 42)
|
||||
#%% Stratified K-fold: Single model
|
||||
input_df = numerical_features_df
|
||||
#X_array = np.array(input_df)
|
||||
var_type = 'numerical'
|
||||
|
||||
input_df = all_features_df
|
||||
#X_array = np.array(input_df)
|
||||
var_type = 'mixed'
|
||||
|
||||
input_df = categorical_features_df
|
||||
#X_array = np.array(input_df)
|
||||
var_type = 'categorical'
|
||||
|
||||
y_targetF = target1
|
||||
#==============================================================================
|
||||
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
|
||||
numerical_ix
|
||||
|
||||
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
|
||||
categorical_ix
|
||||
|
||||
# Determine preprocessing steps ~ var_type
|
||||
if var_type == 'numerical':
|
||||
t = [('num', MinMaxScaler(), numerical_ix)]
|
||||
|
||||
if var_type == 'categorical':
|
||||
t = [('cat', OneHotEncoder(), categorical_ix)]
|
||||
|
||||
if var_type == 'mixed':
|
||||
t = [('cat', OneHotEncoder(), categorical_ix)
|
||||
, ('num', MinMaxScaler(), numerical_ix)]
|
||||
|
||||
###############################################################################
|
||||
col_transform = ColumnTransformer(transformers = t
|
||||
, remainder='passthrough')
|
||||
|
||||
###############################################################################
|
||||
rs = {'random_state': 42}
|
||||
del(model1)
|
||||
|
||||
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
                           , ('log_reg', LogisticRegression(class_weight = None)) ]) # NB: 'unbalanced' is not a valid sklearn class_weight (use 'balanced', a dict, or None); None keeps the default unweighted classes
|
||||
|
||||
# model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
|
||||
# , ('log_reg', LogisticRegression(**rs)) ])
|
||||
|
||||
del(model1)
|
||||
nb = BernoulliNB()
|
||||
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
|
||||
, ('nb', nb) ])
|
||||
|
||||
del(model1)
|
||||
knn = KNeighborsClassifier()
|
||||
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
|
||||
, ('knn', knn) ])
|
||||
del(model1)
|
||||
svm = SVC(**rs)
|
||||
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
|
||||
, ('svm', svm) ])
|
||||
del(model1)
|
||||
mlp = MLPClassifier(max_iter = 500, **rs)
|
||||
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
|
||||
, ('mlp', mlp) ])
|
||||
del(model1)
|
||||
dt = DecisionTreeClassifier(**rs)
|
||||
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
|
||||
, ('dt', dt) ])
|
||||
del(model1)
|
||||
et = ExtraTreesClassifier(**rs)
|
||||
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
|
||||
, ('et', et) ])
|
||||
del(model1)
|
||||
rf = RandomForestClassifier(**rs)
|
||||
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
|
||||
, ('rf', rf) ])
|
||||
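# The del(model1)/redefine pattern above exists so the same SKF loop below can be re-run
# with a different estimator. A hedged alternative sketch (names illustrative, reusing rs
# and the classes already imported) keeps the candidates in a dict and picks one by name:
candidate_models = {'log_reg' : LogisticRegression(class_weight = 'balanced')
                    , 'nb'      : BernoulliNB()
                    , 'knn'     : KNeighborsClassifier()
                    , 'svm'     : SVC(**rs)
                    , 'mlp'     : MLPClassifier(max_iter = 500, **rs)
                    , 'dt'      : DecisionTreeClassifier(**rs)
                    , 'et'      : ExtraTreesClassifier(**rs)
                    , 'rf'      : RandomForestClassifier(**rs)}
# e.g. model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
#                                 , ('clf', candidate_models['rf'])])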
###############################################################################
|
||||
#%% run
|
||||
del(mm)
|
||||
|
||||
skf = StratifiedKFold(n_splits = 10
|
||||
, shuffle = True
|
||||
, **rs)
|
||||
|
||||
#X_array = np.array(numerical_features_df)
|
||||
#Y = target1
|
||||
|
||||
model_scores_df = pd.DataFrame()
|
||||
fscoreL = []
|
||||
mccL = []
|
||||
presL = []
|
||||
recallL = []
|
||||
accuL = []
|
||||
roc_aucL = []
|
||||
|
||||
# for train_index, test_index in skf.split(X_array, Y):
|
||||
# x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
|
||||
# y_train_fold, y_test_fold = Y[train_index], Y[test_index]
|
||||
for train_index, test_index in skf.split(input_df, y_targetF):
|
||||
x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
|
||||
y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
|
||||
|
||||
model1.fit(x_train_fold, y_train_fold)
|
||||
y_pred_fold = model1.predict(x_test_fold)
|
||||
|
||||
#----------------
|
||||
# Model metrics
|
||||
#----------------
|
||||
# F1-Score
|
||||
fscore = f1_score(y_test_fold, y_pred_fold)
|
||||
fscoreL.append(fscore)
|
||||
fscoreM = mean(fscoreL)
|
||||
|
||||
# Matthews correlation coefficient
|
||||
mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
|
||||
mccL.append(mcc)
|
||||
mccM = mean(mccL)
|
||||
|
||||
# Precision
|
||||
pres = precision_score(y_test_fold, y_pred_fold)
|
||||
presL.append(pres)
|
||||
presM = mean(presL)
|
||||
|
||||
# Recall
|
||||
recall = recall_score(y_test_fold, y_pred_fold)
|
||||
recallL.append(recall)
|
||||
recallM = mean(recallL)
|
||||
|
||||
# Accuracy
|
||||
accu = accuracy_score(y_test_fold, y_pred_fold)
|
||||
accuL.append(accu)
|
||||
accuM = mean(accuL)
|
||||
|
||||
# ROC_AUC
|
||||
roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
|
||||
roc_aucL.append(roc_auc)
|
||||
roc_aucM = mean(roc_aucL)
|
||||
|
||||
model_scores_df = model_scores_df.append({'Model' : model1.steps[1][0]
|
||||
,'F1_score' : fscoreM
|
||||
, 'MCC' : mccM
|
||||
, 'Precision': presM
|
||||
, 'Recall' : recallM
|
||||
, 'Accuracy' : accuM
|
||||
, 'ROC_curve': roc_aucM}
|
||||
, ignore_index = True)
|
||||
print('\nModel metrics:\n', model_scores_df)
|
||||
mm = model_scores_df.mean()
|
||||
|
||||
print('\nModel metrics mean:\n', mm)
|
||||
|
||||
print('\nModel metrics:\n', model_scores_df)
|
179 earlier_versions/my_datap2.py Normal file
@@ -0,0 +1,179 @@
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Thu Feb 24 10:48:10 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
###############################################################################
|
||||
# questions:
|
||||
# which data to use: merged_df3 or merged_df2
|
||||
# which is the target? or_mychisq or drtype col
|
||||
# scaling: can it be from -1 to 1?
|
||||
# how to include the mutation information?
|
||||
# 'wild_type', 'mutant', 'postion'
|
||||
# whether to log transform the af and or cols
|
||||
# to allow mean mode values to be imputed for validation set
|
||||
# whether to calculate mean, median accounting for NA or removing them?
|
||||
|
||||
# strategy:
|
||||
# available data = X_train
|
||||
# available data but NAN = validation_test
|
||||
# test data: mut generated not in mcsm
|
||||
|
||||
###############################################################################
|
||||
import os, sys
|
||||
import re
|
||||
from sklearn.datasets import load_boston
|
||||
from sklearn import linear_model
|
||||
from sklearn import preprocessing
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
print(np.__version__)
|
||||
print(pd.__version__)
|
||||
#%% read data
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir(homedir + "/git/ML_AI_training/test_data")
|
||||
|
||||
# this needs to be merged_df2 or merged_df3?
|
||||
my_df = pd.read_csv("pnca_all_params.csv")
|
||||
|
||||
my_df.dtypes
|
||||
my_df_cols = my_df.columns
|
||||
|
||||
omit_cols1 = ['pdb_file'
|
||||
, 'seq_offset4pdb'
|
||||
, 'mut_3upper'
|
||||
, 'wild_pos'
|
||||
, 'wild_chain_pos'
|
||||
, 'chain'
|
||||
, 'wt_3upper'
|
||||
, 'consurf_colour'
|
||||
, 'consurf_colour_rev'
|
||||
, 'consurf_msa_data'
|
||||
, 'consurf_aa_variety'
|
||||
, 'snap2_accuracy_pc'
|
||||
, 'beta_logistic'
|
||||
, 'se_logistic'
|
||||
, 'zval_logisitc'
|
||||
, 'pval_chisq'
|
||||
, 'log10_or_mychisq'
|
||||
, 'neglog_pval_fisher'
|
||||
, 'or_fisher'
|
||||
, 'wild_type'
|
||||
, 'mutant_type'
|
||||
, 'position'
|
||||
, 'ligand_id'
|
||||
, 'mutation'
|
||||
, 'ss'
|
||||
, 'ss_class' # include it later?
|
||||
, 'contacts'
|
||||
]
|
||||
|
||||
omit_cols2 = list(my_df.columns[my_df.columns.str.contains(".*ci_.*") | my_df.columns.str.contains(".*_scaled*") | my_df.columns.str.contains(".*_outcome*")])
|
||||
|
||||
# [WATCH:] just to test since these have negative values!
|
||||
omit_cols3 = list(my_df.columns[my_df.columns.str.contains("electro_.*") | my_df.columns.str.contains("disulfide_.*") | my_df.columns.str.contains("hbonds_.*") | my_df.columns.str.contains("partcov_.*") | my_df.columns.str.contains("vdwclashes.*") | my_df.columns.str.contains("volumetric.*")])
|
||||
|
||||
omit_cols = omit_cols1 + omit_cols2 + omit_cols3
|
||||
|
||||
my_df_filt = my_df.loc[:, ~my_df.columns.isin(omit_cols)]
|
||||
my_df_filt_cols = my_df_filt.columns
|
||||
|
||||
#fill NaNs with column means in each column
|
||||
my_df_filt2 = my_df_filt.fillna(my_df_filt.mean())
|
||||
my_df_filt3 = my_df_filt.fillna(my_df_filt.median())
|
||||
|
||||
my_df_filt_noNA = my_df_filt.fillna(0)
|
||||
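# Note (assumption): filling NaNs with whole-dataframe means/medians before any
# train/validation split leaks statistics across the split. A leakage-free sketch is to
# impute inside the model pipeline so the statistics come from the training data only
# (imports repeated here and the model name is illustrative):
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

model_imputed = Pipeline(steps = [('impute', SimpleImputer(strategy = 'median'))
                                  , ('pre'   , preprocessing.MinMaxScaler())
                                  , ('logis' , LogisticRegression())])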
|
||||
summ = my_df_filt.describe()
|
||||
summ_noNA = my_df_filt_noNA.describe()
|
||||
|
||||
foo = my_df_filt['or_mychisq'].value_counts()
|
||||
foo = foo.to_frame()
|
||||
|
||||
########################
|
||||
# [WATCH]: Drop na
|
||||
my_df2 = my_df_filt3.dropna()
|
||||
my_df2['resistance'] = my_df2['or_mychisq'].apply(lambda x: 0 if x <=1 else 1)
|
||||
my_df2['resistance'].value_counts()
|
||||
y = my_df2['resistance']
|
||||
y.value_counts()
|
||||
|
||||
|
||||
#%%============================================================================
|
||||
X_validation_muts = my_df['mutationinformation'][~my_df['mutationinformation'].isin(my_df2['mutationinformation'])]
|
||||
X_validation_all = my_df_filt3[~my_df_filt3['mutationinformation'].isin(my_df2['mutationinformation'])]
|
||||
X_validation_f = X_validation_all.loc[:, ~X_validation_all.columns.isin(['or_mychisq', 'resistance'])]
|
||||
X_validation = X_validation_f.set_index('mutationinformation')
|
||||
|
||||
#%% fill na in cols with mean value
|
||||
X_validation.info()
|
||||
X_validation.isna().any()
|
||||
|
||||
na_df = X_validation_f[X_validation_f.columns[X_validation_f.isna().any()]]
|
||||
na_colnames = X_validation_f.columns[X_validation_f.isna().any()]
|
||||
na_colsL = list(na_colnames)
|
||||
|
||||
#==============================================================================
|
||||
omit_cols_y = ['or_mychisq', 'resistance']
|
||||
my_df_ml = my_df2.loc[:, ~my_df2.columns.isin(omit_cols_y)]
|
||||
#%%############################################################################
|
||||
X_train = my_df_ml.set_index('mutationinformation')
|
||||
#X_train = X_train.iloc[:,:4]
|
||||
y_train = y
|
||||
#X_train = X_train.dropna()
|
||||
#y_train = y.dropna()
|
||||
|
||||
# check dim
|
||||
X_train.shape
|
||||
y_train.shape
|
||||
|
||||
|
||||
###############################################################################
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.model_selection import cross_validate
|
||||
from sklearn.metrics import make_scorer
|
||||
from sklearn.metrics import accuracy_score, precision_score, recall_score
|
||||
|
||||
model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
                                 , ('logis', LogisticRegression(class_weight = None)) # 'unbalanced' is not a valid class_weight; None keeps the default unweighted classes
                                 ])
|
||||
|
||||
model_logisP.fit(X_train, y_train)
|
||||
fitted_vals = model_logisP.predict(X_train)
|
||||
fitted_vals
|
||||
|
||||
# gives the array of predictions
|
||||
model_logisP.predict(X_train)
|
||||
model_logisP.predict(X_validation)
|
||||
y_pred = model_logisP.predict(X_train)
|
||||
y_pred2 = model_logisP.predict(X_validation)
|
||||
|
||||
# y_pred2 comes from X_validation (different length, no labels here), so score the
# training predictions (y_pred) against y_train instead
accuracy_score(y_train, y_pred)
precision_score(y_train, y_pred, pos_label = 1) # tp/(tp + fp)
recall_score(y_train, y_pred, pos_label = 1)    # tp/(tp + fn)
|
||||
|
||||
|
||||
################
|
||||
acc = make_scorer(accuracy_score)

def precision(y_true,y_pred):
    return precision_score(y_true,y_pred,pos_label = 1) #0

def recall(y_true,y_pred):
    return recall_score(y_true, y_pred, pos_label = 1) #0

prec = make_scorer(precision)
rec  = make_scorer(recall)

output = cross_validate(model_logisP
                        , X_train
                        , y
                        , scoring = {'acc' : acc
                                     ,'prec' : prec
                                     ,'rec'  : rec}
                        , cv = 10, return_train_score = False)

pd.DataFrame(output).mean()
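# MCC is reported elsewhere in this repo; a sketch of adding it to the same
# cross_validate call via make_scorer (matthews_corrcoef is imported here because this
# script does not import it above; output2/mcc_scorer are illustrative names):
from sklearn.metrics import matthews_corrcoef
mcc_scorer = make_scorer(matthews_corrcoef)
output2 = cross_validate(model_logisP
                         , X_train
                         , y
                         , scoring = {'acc' : acc
                                      ,'prec': prec
                                      ,'rec' : rec
                                      ,'mcc' : mcc_scorer}
                         , cv = 10, return_train_score = False)
pd.DataFrame(output2).mean()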
|
376 earlier_versions/my_datap3.py Normal file
@@ -0,0 +1,376 @@
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Thu Feb 24 10:48:10 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
###############################################################################
|
||||
# questions:
|
||||
# which data to use: merged_df3 or merged_df2
|
||||
# which is the target? or_mychisq or drtype col
|
||||
# scaling: can it be from -1 to 1?
|
||||
# how to include the mutation information?
|
||||
# 'wild_type', 'mutant', 'postion'
|
||||
# whether to log transform the af and or cols
|
||||
# to allow mean mode values to be imputed for validation set
|
||||
# whether to calculate mean, median accounting for NA or removing them?
|
||||
|
||||
# strategy:
|
||||
# available data = X_train
|
||||
# available data but NAN = validation_test
|
||||
# test data: mut generated not in mcsm
|
||||
|
||||
###############################################################################
|
||||
import os, sys
|
||||
import re
|
||||
from sklearn.datasets import load_boston
|
||||
from sklearn import datasets
|
||||
from sklearn import linear_model
|
||||
from sklearn import preprocessing
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
print(np.__version__)
|
||||
print(pd.__version__)
|
||||
from statistics import mean, stdev
|
||||
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.model_selection import cross_validate
|
||||
from sklearn.metrics import make_scorer
|
||||
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.datasets import load_digits
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.metrics import confusion_matrix
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from sklearn.compose import make_column_transformer
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
|
||||
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
|
||||
|
||||
#%% read data
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir(homedir + "/git/ML_AI_training/test_data")
|
||||
|
||||
# this needs to be merged_df2 or merged_df3?
|
||||
my_df = pd.read_csv("pnca_all_params.csv")
|
||||
|
||||
my_df.dtypes
|
||||
my_df_cols = my_df.columns
|
||||
|
||||
omit_cols1 = ['pdb_file'
|
||||
, 'seq_offset4pdb'
|
||||
, 'mut_3upper'
|
||||
, 'wild_pos'
|
||||
, 'wild_chain_pos'
|
||||
, 'chain'
|
||||
, 'wt_3upper'
|
||||
, 'consurf_colour'
|
||||
, 'consurf_colour_rev'
|
||||
, 'consurf_msa_data'
|
||||
, 'consurf_aa_variety'
|
||||
, 'snap2_accuracy_pc'
|
||||
, 'beta_logistic'
|
||||
, 'se_logistic'
|
||||
, 'zval_logisitc'
|
||||
, 'pval_chisq'
|
||||
, 'log10_or_mychisq'
|
||||
, 'neglog_pval_fisher'
|
||||
, 'or_fisher'
|
||||
, 'wild_type'
|
||||
, 'mutant_type'
|
||||
, 'position'
|
||||
, 'ligand_id'
|
||||
, 'mutation'
|
||||
, 'ss'
|
||||
, 'ss_class' # include it later?
|
||||
, 'contacts'
|
||||
]
|
||||
|
||||
omit_cols2 = list(my_df.columns[my_df.columns.str.contains(".*ci_.*") | my_df.columns.str.contains(".*_scaled*") | my_df.columns.str.contains(".*_outcome*")])
|
||||
|
||||
# [WATCH:] just to test since these have negative values!
|
||||
omit_cols3 = list(my_df.columns[my_df.columns.str.contains("electro_.*") | my_df.columns.str.contains("disulfide_.*") | my_df.columns.str.contains("hbonds_.*") | my_df.columns.str.contains("partcov_.*") | my_df.columns.str.contains("vdwclashes.*") | my_df.columns.str.contains("volumetric.*")])
|
||||
|
||||
omit_cols = omit_cols1 + omit_cols2 + omit_cols3
|
||||
|
||||
# Filter df: Filter columns to focus on my selected ones
|
||||
my_df_filt = my_df.loc[:, ~my_df.columns.isin(omit_cols)]
|
||||
my_df_filt_cols = my_df_filt.columns
|
||||
|
||||
#Fill na of filtered df: fill NaNs with column means/medians in each column
|
||||
my_df_filt2 = my_df_filt.fillna(my_df_filt.mean())
|
||||
my_df_filt3 = my_df_filt.fillna(my_df_filt.median())
|
||||
#my_df_filt_noNA = my_df_filt.fillna(0)
|
||||
|
||||
summ = my_df_filt.describe()
|
||||
summ2 = my_df_filt2.describe()
|
||||
summ3 = my_df_filt3.describe()
|
||||
#summ_noNA = my_df_filt_noNA.describe()
|
||||
|
||||
########################
|
||||
# [WATCH]: Drop na
|
||||
# Get Y
|
||||
my_df2 = my_df_filt.dropna()
|
||||
my_df2['resistance'] = my_df2['or_mychisq'].apply(lambda x: 0 if x <=1 else 1)
|
||||
my_df2['resistance'].value_counts()
|
||||
Y = my_df2['resistance']
|
||||
Y = np.array(Y)
|
||||
#Y = Y.reset_index()
|
||||
#Y = Y.drop(['index'], axis = 1)
|
||||
#Y.value_counts()
|
||||
#Y = np.array(Y)
|
||||
|
||||
# GET X
|
||||
omit_cols_y = ['or_mychisq', 'resistance']
|
||||
my_df_ml = my_df2.loc[:, ~my_df2.columns.isin(omit_cols_y)]
|
||||
#my_df_ml = my_df_ml.set_index('mutationinformation')
|
||||
X = my_df_ml
|
||||
X = X.drop(['mutationinformation'], axis = 1)
|
||||
X = np.array(X)
|
||||
|
||||
#X = X.reset_index()
|
||||
|
||||
|
||||
# check dim
|
||||
X.shape
|
||||
Y.shape
|
||||
my_df2 = my_df2.reset_index()
|
||||
|
||||
#####################
|
||||
#https://stackoverflow.com/questions/49134338/kfolds-cross-validation-vs-train-test-split
|
||||
rf = RandomForestClassifier(n_estimators=100, random_state=42)
|
||||
|
||||
#https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2
|
||||
# k-FOLD
|
||||
print('Class Ratio:',
|
||||
sum(Y)/len(Y))
|
||||
|
||||
print('Class Ratio:',
|
||||
sum(my_df2['resistance'])/len(my_df2['resistance'])
|
||||
)
|
||||
|
||||
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
|
||||
target = my_df2.loc[:,'resistance']
|
||||
|
||||
fold_no = 1
|
||||
for train_index, test_index in skf.split(my_df2, target):
|
||||
train = my_df2.loc[train_index,:]
|
||||
test = my_df2.loc[test_index,:]
|
||||
print('Fold',str(fold_no),
|
||||
'Class Ratio:',
|
||||
sum(test['resistance'])/len(test['resistance']))
|
||||
fold_no += 1
|
||||
|
||||
model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
                                 , ('logis', LogisticRegression(class_weight = None)) # 'unbalanced' is not a valid class_weight; None keeps the default unweighted classes
                                 ])
|
||||
|
||||
X_features = my_df_ml.drop(['mutationinformation'], axis = 1).columns.to_list() # X_train is not defined yet at this point in the script; take the feature names from my_df_ml instead
|
||||
|
||||
def train_model(train, test, fold_no):
    X = X_features
    y = ['resistance']
    X_train = train[X]
    y_train = train[y]
    X_test = test[X]
    y_test = test[y]
    model_logisP.fit(X_train,y_train)
    predictions = model_logisP.predict(X_test)
    print('Fold',str(fold_no),
          'Accuracy:',
          accuracy_score(y_test,predictions))
|
||||
|
||||
fold_no = 1
|
||||
for train_index, test_index in skf.split(my_df2, target):
|
||||
train = my_df2.loc[train_index,:]
|
||||
test = my_df2.loc[test_index,:]
|
||||
train_model(train,test,fold_no)
|
||||
fold_no += 1
|
||||
#%%
|
||||
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=20)
|
||||
lst_accu_stratified = []
|
||||
scaler = preprocessing.MinMaxScaler()
|
||||
X_scaled = scaler.fit_transform(X)
|
||||
X_scaled = X_scaled[:,[1,2,3,15,16]]
|
||||
|
||||
#lr = linear_model.LogisticRegression(class_weight = 'unbalanced')
|
||||
lr = linear_model.LogisticRegression()
|
||||
|
||||
for train_index1, test_index1 in skf.split(X, Y):
|
||||
#print(train_index)
|
||||
#print(test_index)
|
||||
x_train_fold1, x_test_fold1 = X_scaled[train_index1], X_scaled[test_index1]
|
||||
y_train_fold1, y_test_fold1 = Y[train_index1], Y[test_index1]
|
||||
lr.fit(x_train_fold1, y_train_fold1)
|
||||
lst_accu_stratified.append(lr.score(x_test_fold1, y_test_fold1))
|
||||
|
||||
# print output
|
||||
print('List of possible accuracy', lst_accu_stratified)
|
||||
print('Max accuracy:', max(lst_accu_stratified)*100, "%")
|
||||
print('Min accuracy:', min(lst_accu_stratified)*100, "%")
|
||||
print('Mean accuracy:', mean(lst_accu_stratified)*100,"%")
|
||||
print('St Dev:', stdev(lst_accu_stratified)*100,"%")
|
||||
|
||||
|
||||
# cancer data
|
||||
cancer = datasets.load_breast_cancer()
|
||||
x = cancer.data
|
||||
y = cancer.target
|
||||
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
|
||||
lst_accu_stratifiedC = []
|
||||
scaler = preprocessing.MinMaxScaler()
|
||||
x_scaled = scaler.fit_transform(x)
|
||||
x_scaled = x_scaled[:,[1,2,3, 15, 16]]
|
||||
|
||||
for train_index, test_index in skf.split(x, y):
|
||||
#print(train_index)
|
||||
#print(test_index)
|
||||
x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index]
|
||||
y_train_fold, y_test_fold = y[train_index], y[test_index]
|
||||
lr.fit(x_train_fold, y_train_fold)
|
||||
lst_accu_stratifiedC.append(lr.score(x_test_fold, y_test_fold))
|
||||
|
||||
# print output
|
||||
print('List of possible accuracy', lst_accu_stratifiedC)
|
||||
print('Max accuracy:', max(lst_accu_stratifiedC)*100, "%")
|
||||
print('Min accuracy:', min(lst_accu_stratifiedC)*100, "%")
|
||||
print('Mean accuracy:', mean(lst_accu_stratifiedC)*100,"%")
|
||||
print('St Dev:', stdev(lst_accu_stratifiedC)*100,"%")
|
||||
|
||||
#%%
|
||||
##
|
||||
# https://towardsdatascience.com/my-random-forest-classifier-cheat-sheet-in-python-fedb84f8cf4f
|
||||
y_all = my_df_filt['or_mychisq'].apply(lambda x: 0 if x <=1 else 1)
|
||||
X_all = my_df_filt.drop(['mutationinformation', 'or_mychisq'], axis = 1)
|
||||
seed = 20 # so that the result is reproducible
|
||||
|
||||
X_all = my_df_filt.drop(['mutationinformation', 'or_mychisq'], axis = 1)
|
||||
X_all = X_all.iloc[:,:6]
|
||||
|
||||
X_train, X_test, y_train, y_test = train_test_split(X_all,y_all
|
||||
, test_size=0.333
|
||||
, random_state = seed)
|
||||
# Now, it is time to make NA a category.
|
||||
# In Python, NaN is considered NAs.
|
||||
# When encoded, those NaN will be ignored.
|
||||
# Hence, it is useful to replace NaN with na, which is now a category called ‘na’.
|
||||
# This will be taken into account when encoding later on.
|
||||
#X_train = X_train.fillna('na')
|
||||
#X_test = X_test.fillna('na')
|
||||
|
||||
X_train = X_train.fillna(X_train.median())
|
||||
X_test = X_test.fillna(X_test.median())
|
||||
|
||||
X_train.dtypes
|
||||
|
||||
features_to_encode = list(X_train.select_dtypes(include = ['object']).columns)
|
||||
|
||||
col_trans = make_column_transformer(
|
||||
(OneHotEncoder(),features_to_encode),
|
||||
remainder = "passthrough"
|
||||
)
|
||||
|
||||
rf_classifier = RandomForestClassifier(
|
||||
min_samples_leaf=50,
|
||||
n_estimators=150,
|
||||
bootstrap=True,
|
||||
oob_score=True,
|
||||
n_jobs=-1,
|
||||
random_state=seed,
|
||||
max_features='auto')
|
||||
|
||||
pipe = make_pipeline(col_trans, rf_classifier)
|
||||
pipe.fit(X_train, y_train)
|
||||
y_pred = pipe.predict(X_test)
|
||||
|
||||
accuracy_score(y_test, y_pred)
|
||||
print(f"The accuracy of the model is {round(accuracy_score(y_test,y_pred),3)*100} %")
|
||||
|
||||
recall_score(y_test, y_pred)
|
||||
precision_score(y_test, y_pred)
|
||||
f1_score(y_test, y_pred)
|
||||
roc_auc_score (y_test, y_pred)
|
||||
roc_curve(y_test, y_pred)
|
||||
|
||||
train_probs = pipe.predict_proba(X_train)[:,1]
|
||||
probs = pipe.predict_proba(X_test)[:, 1]
|
||||
train_predictions = pipe.predict(X_train)
|
||||
|
||||
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs)}')
|
||||
print(f'Test ROC AUC Score: {roc_auc_score(y_test, probs)}')
|
||||
|
||||
def evaluate_model(y_pred, probs,train_predictions, train_probs):
|
||||
baseline = {}
|
||||
baseline['recall']=recall_score(y_test,
|
||||
[1 for _ in range(len(y_test))])
|
||||
baseline['precision'] = precision_score(y_test,
|
||||
[1 for _ in range(len(y_test))])
|
||||
baseline['roc'] = 0.5
|
||||
results = {}
|
||||
results['recall'] = recall_score(y_test, y_pred)
|
||||
results['precision'] = precision_score(y_test, y_pred)
|
||||
results['roc'] = roc_auc_score(y_test, probs)
|
||||
train_results = {}
|
||||
train_results['recall'] = recall_score(y_train,
|
||||
train_predictions)
|
||||
train_results['precision'] = precision_score(y_train, train_predictions)
|
||||
train_results['roc'] = roc_auc_score(y_train, train_probs)
|
||||
# for metric in ['recall', 'precision', 'roc']:
|
||||
# print(f"Baseline: {round(baseline[metric], 2)}Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}")
|
||||
|
||||
# Calculate false positive rates and true positive rates
|
||||
base_fpr, base_tpr, _ = roc_curve(y_test, [1 for _ in range(len(y_test))])
|
||||
model_fpr, model_tpr, _ = roc_curve(y_test, probs)
|
||||
plt.figure(figsize = (8, 6))
|
||||
plt.rcParams['font.size'] = 16
|
||||
# Plot both curves
|
||||
plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
|
||||
plt.plot(model_fpr, model_tpr, 'r', label = 'model')
|
||||
plt.legend(); plt.xlabel('False Positive Rate');
|
||||
plt.ylabel('True Positive Rate'); plt.title('ROC Curves');
|
||||
plt.show()
|
||||
|
||||
# Recall Baseline: 1.0 Test: 0.92 Train: 0.93
|
||||
# Precision Baseline: 0.48 Test: 0.9 Train: 0.91
|
||||
# Roc Baseline: 0.5 Test: 0.97 Train: 0.97
|
||||
|
||||
evaluate_model(y_pred,probs,train_predictions,train_probs)
|
||||
#%%
|
||||
import itertools
|
||||
def plot_confusion_matrix(cm, classes, normalize = False,
|
||||
title='Confusion matrix',
|
||||
cmap=plt.cm.Greens): # can change color
|
||||
plt.figure(figsize = (10, 10))
|
||||
plt.imshow(cm, interpolation='nearest', cmap=cmap)
|
||||
plt.title(title, size = 24)
|
||||
plt.colorbar(aspect=4)
|
||||
tick_marks = np.arange(len(classes))
|
||||
plt.xticks(tick_marks, classes, rotation=45, size = 14)
|
||||
plt.yticks(tick_marks, classes, size = 14)
|
||||
fmt = '.2f' if normalize else 'd'
|
||||
thresh = cm.max() / 2.
|
||||
# Label the plot
|
||||
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): plt.text(j, i, format(cm[i, j], fmt),
|
||||
fontsize = 20,
|
||||
horizontalalignment="center",
|
||||
color="white" if cm[i, j] > thresh else "black")
|
||||
plt.grid(None)
|
||||
plt.tight_layout()
|
||||
plt.ylabel('True label', size = 18)
|
||||
plt.xlabel('Predicted label', size = 18)
|
||||
# Let's plot it out
|
||||
cm = confusion_matrix(y_test, y_pred)
|
||||
plot_confusion_matrix(cm, classes = ['0 - Susceptible', '1 - Resistant'],
|
||||
title = 'R/S Confusion Matrix')
|
||||
|
||||
print(rf_classifier.feature_importances_)
|
||||
print(f" There are {len(rf_classifier.feature_importances_)} features in total")
|
361 earlier_versions/my_datap4.py Normal file
@@ -0,0 +1,361 @@
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Thu Feb 24 10:48:10 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
###############################################################################
|
||||
# questions:
|
||||
# which data to use: merged_df3 or merged_df2
|
||||
# which is the target? or_mychisq or drtype col
|
||||
# scaling: can it be from -1 to 1?
|
||||
# how to include the mutation information?
|
||||
# 'wild_type', 'mutant', 'postion'
|
||||
# whether to log transform the af and or cols
|
||||
# to allow mean mode values to be imputed for validation set
|
||||
# whether to calculate mean, median accounting for NA or removing them?
|
||||
|
||||
# strategy:
|
||||
# available data = X_train
|
||||
# available data but NAN = validation_test
|
||||
# test data: mut generated not in mcsm
|
||||
|
||||
###############################################################################
|
||||
import os, sys
|
||||
import re
|
||||
from sklearn.datasets import load_boston
|
||||
from sklearn import datasets
|
||||
from sklearn import linear_model
|
||||
from sklearn import preprocessing
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
print(np.__version__)
|
||||
print(pd.__version__)
|
||||
from statistics import mean, stdev
|
||||
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.model_selection import cross_validate
|
||||
from sklearn.metrics import make_scorer
|
||||
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.datasets import load_digits
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.metrics import confusion_matrix
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from sklearn.compose import make_column_transformer
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
|
||||
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
|
||||
from sklearn.metrics import plot_precision_recall_curve
|
||||
import itertools
|
||||
#%% read data
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir(homedir + "/git/ML_AI_training/test_data")
|
||||
|
||||
# this needs to be merged_df2 or merged_df3?
|
||||
#gene 'pncA'
|
||||
drug = 'pyrazinamide'
|
||||
|
||||
my_df = pd.read_csv("pnca_merged_df3.csv")
|
||||
|
||||
my_df.dtypes
|
||||
my_df_cols = my_df.columns
|
||||
|
||||
#%%
|
||||
# GET Y
|
||||
# Y = my_df.loc[:,drug] #has NA
|
||||
dm_om_map = {'DM': 1, 'OM': 0}
|
||||
my_df['resistance'] = my_df['mutation_info_labels'].map(dm_om_map)
|
||||
|
||||
# sanity check
|
||||
my_df['resistance'].value_counts()
|
||||
my_df['mutation_info_labels'].value_counts()
|
||||
|
||||
Y = my_df['resistance']
|
||||
|
||||
#%%
|
||||
# GET X
|
||||
cols = my_df.columns
|
||||
X = my_df[['ligand_distance'
|
||||
, 'ligand_affinity_change'
|
||||
, 'duet_stability_change', 'ddg_foldx', 'deepddg', 'ddg_dynamut2', 'consurf_score'
|
||||
, 'snap2_score'
|
||||
#, 'snap2_accuracy_pc'
|
||||
, 'asa'
|
||||
, 'rsa']]
|
||||
|
||||
#%%
|
||||
####################################
|
||||
# SIMPLEST case of train_test split
|
||||
# Random forest
|
||||
# one hot encoder
|
||||
# MinMaxScaler
|
||||
# https://towardsdatascience.com/my-random-forest-classifier-cheat-sheet-in-python-fedb84f8cf4f
|
||||
####################################
|
||||
seed = 50
|
||||
X_train, X_test, y_train, y_test = train_test_split(X,Y
|
||||
, test_size = 0.333
|
||||
, random_state = seed)
|
||||
|
||||
features_to_encode = list(X_train.select_dtypes(include = ['object']).columns)
|
||||
|
||||
col_trans = make_column_transformer(
|
||||
(OneHotEncoder(),features_to_encode),
|
||||
remainder = "passthrough"
|
||||
)
|
||||
MinMaxS = preprocessing.MinMaxScaler()
|
||||
standardS = preprocessing.StandardScaler()
|
||||
|
||||
rf_classifier = RandomForestClassifier(
|
||||
min_samples_leaf=50,
|
||||
n_estimators=150,
|
||||
bootstrap=True,
|
||||
oob_score=True,
|
||||
n_jobs=-1,
|
||||
random_state=seed,
|
||||
max_features='auto')
|
||||
|
||||
pipe = make_pipeline(col_trans
|
||||
#, MinMaxS
|
||||
#, standardS
|
||||
, rf_classifier)
|
||||
|
||||
pipe.fit(X_train, y_train)
|
||||
y_pred = pipe.predict(X_test)
|
||||
accuracy_score(y_test, y_pred)
|
||||
|
||||
print("\nModel evaluation:\n")
|
||||
print(f"Accuracy: {round(accuracy_score(y_test,y_pred),3)*100} %")
|
||||
print(f"Recall: {round(recall_score(y_test,y_pred),3)*100} %")
|
||||
print(f"Precision: {round(precision_score(y_test,y_pred),3)*100} %")
|
||||
print(f"F1-score: {round(f1_score(y_test,y_pred),3)*100} %")
|
||||
|
||||
recall_score(y_test, y_pred)
|
||||
precision_score(y_test, y_pred)
|
||||
f1_score(y_test, y_pred)
|
||||
roc_auc_score (y_test, y_pred) # not sure!
|
||||
roc_curve(y_test, y_pred) # not sure!
|
||||
|
||||
disp = plot_precision_recall_curve(pipe, X_test, y_test)
|
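# Note (assumption about the scikit-learn version): plot_precision_recall_curve was
# deprecated in scikit-learn 1.0 and later removed; on newer releases the equivalent is
# (kept commented so only one of the two calls runs):
# from sklearn.metrics import PrecisionRecallDisplay
# disp = PrecisionRecallDisplay.from_estimator(pipe, X_test, y_test)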
||||
|
||||
train_probs = pipe.predict_proba(X_train)[:,1]
|
||||
probs = pipe.predict_proba(X_test)[:, 1]
|
||||
train_predictions = pipe.predict(X_train)
|
||||
|
||||
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs)}')
|
||||
print(f'Test ROC AUC Score: {roc_auc_score(y_test, probs)}')
|
||||
|
||||
def evaluate_model(y_pred, probs,train_predictions, train_probs):
|
||||
baseline = {}
|
||||
baseline['recall']=recall_score(y_test,
|
||||
[1 for _ in range(len(y_test))])
|
||||
baseline['precision'] = precision_score(y_test,
|
||||
[1 for _ in range(len(y_test))])
|
||||
baseline['roc'] = 0.5
|
||||
results = {}
|
||||
results['recall'] = recall_score(y_test, y_pred)
|
||||
results['precision'] = precision_score(y_test, y_pred)
|
||||
results['roc'] = roc_auc_score(y_test, probs)
|
||||
train_results = {}
|
||||
train_results['recall'] = recall_score(y_train,
|
||||
train_predictions)
|
||||
train_results['precision'] = precision_score(y_train, train_predictions)
|
||||
train_results['roc'] = roc_auc_score(y_train, train_probs)
|
||||
# for metric in ['recall', 'precision', 'roc']:
|
||||
# print(f"Baseline: {round(baseline[metric], 2)}Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}")
|
||||
|
||||
# Calculate false positive rates and true positive rates
|
||||
base_fpr, base_tpr, _ = roc_curve(y_test, [1 for _ in range(len(y_test))])
|
||||
model_fpr, model_tpr, _ = roc_curve(y_test, probs)
|
||||
plt.figure(figsize = (8, 6))
|
||||
plt.rcParams['font.size'] = 16
|
||||
# Plot both curves
|
||||
plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
|
||||
plt.plot(model_fpr, model_tpr, 'r', label = 'model')
|
||||
plt.legend(); plt.xlabel('False Positive Rate');
|
||||
plt.ylabel('True Positive Rate'); plt.title('ROC Curves');
|
||||
plt.show()
|
||||
|
||||
# Recall Baseline: 1.0 Test: 0.92 Train: 0.93
|
||||
# Precision Baseline: 0.48 Test: 0.9 Train: 0.91
|
||||
# Roc Baseline: 0.5 Test: 0.97 Train: 0.97
|
||||
|
||||
evaluate_model(y_pred,probs,train_predictions,train_probs)
|
||||
|
||||
def plot_confusion_matrix(cm, classes, normalize = False,
|
||||
title='Confusion matrix',
|
||||
cmap=plt.cm.Greens): # can change color
|
||||
plt.figure(figsize = (10, 10))
|
||||
plt.imshow(cm, interpolation='nearest', cmap=cmap)
|
||||
plt.title(title, size = 24)
|
||||
plt.colorbar(aspect=4)
|
||||
tick_marks = np.arange(len(classes))
|
||||
plt.xticks(tick_marks, classes, rotation=45, size = 14)
|
||||
plt.yticks(tick_marks, classes, size = 14)
|
||||
fmt = '.2f' if normalize else 'd'
|
||||
thresh = cm.max() / 2.
|
||||
# Label the plot
|
||||
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): plt.text(j, i, format(cm[i, j], fmt),
|
||||
fontsize = 20,
|
||||
horizontalalignment="center",
|
||||
color="white" if cm[i, j] > thresh else "black")
|
||||
plt.grid(None)
|
||||
plt.tight_layout()
|
||||
plt.ylabel('True label', size = 18)
|
||||
plt.xlabel('Predicted label', size = 18)
|
||||
# Let's plot it out
|
||||
cm = confusion_matrix(y_test, y_pred)
|
||||
plot_confusion_matrix(cm, classes = ['0 - Susceptible', '1 - Resistant'],
|
||||
title = 'R/S Confusion Matrix')
|
||||
|
||||
print(rf_classifier.feature_importances_)
|
||||
print(f" There are {len(rf_classifier.feature_importances_)} features in total")
|
||||
#%%
|
||||
####################################
|
||||
# Model 2: case of stratified K-fold
|
||||
# Logistic regression
|
||||
# MinMaxScaler
|
||||
# https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2 [ Didn't work!]
|
||||
# https://www.geeksforgeeks.org/stratified-k-fold-cross-validation/
|
||||
####################################
|
||||
print('Class Ratio:',
|
||||
sum(Y)/len(Y))
|
||||
|
||||
print('Class Ratio:',
|
||||
sum(my_df['resistance'])/len(my_df['resistance']))
|
||||
|
||||
seed_skf = 50
|
||||
skf = StratifiedKFold(n_splits = 10
|
||||
, shuffle = True
|
||||
, random_state = seed_skf)
|
||||
|
||||
lst_accu_stratified = []
|
||||
scaler = preprocessing.MinMaxScaler()
|
||||
X_scaled = scaler.fit_transform(X)
|
||||
#X_scaled = X_scaled[:,[1,2,3]]
|
||||
|
||||
#lr = linear_model.LogisticRegression(class_weight = 'unbalanced')
|
||||
lr = linear_model.LogisticRegression()
|
||||
|
||||
for train_index, test_index in skf.split(X, Y):
|
||||
#print(train_index)
|
||||
#print(test_index)
|
||||
x_train_fold, x_test_fold = X_scaled[train_index], X_scaled[test_index]
|
||||
y_train_fold, y_test_fold = Y[train_index], Y[test_index]
|
||||
lr.fit(x_train_fold, y_train_fold)
|
||||
lst_accu_stratified.append(lr.score(x_test_fold, y_test_fold))
|
||||
|
||||
# print output
|
||||
print('List of possible accuracy', lst_accu_stratified)
|
||||
print('Max accuracy:', max(lst_accu_stratified)*100, "%")
|
||||
print('Min accuracy:', min(lst_accu_stratified)*100, "%")
|
||||
print('Mean accuracy:', mean(lst_accu_stratified)*100,"%")
|
||||
print('St Dev:', stdev(lst_accu_stratified)*100,"%")
|
||||
|
||||
#%%
|
||||
#--------------------------------------
|
||||
# Model2.1: same one but with pipeline
|
||||
# slightly different results when using
|
||||
# transformed or untransformed values!
|
||||
#--------------------------------------
|
||||
model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
                                 , ('logis', LogisticRegression(class_weight = None)) ]) # changes stdev; NB: 'unbalanced' is not a valid class_weight (use 'balanced', a dict, or None)
|
||||
seed_skf = 50
|
||||
skf = StratifiedKFold(n_splits = 10
|
||||
, shuffle = True
|
||||
, random_state = seed_skf)
|
||||
|
||||
X_array = np.array(X)
|
||||
lst_accu_stratified = []
|
||||
for train_index, test_index in skf.split(X_array, Y):
|
||||
x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
|
||||
y_train_fold, y_test_fold = Y[train_index], Y[test_index]
|
||||
model_logisP.fit(x_train_fold, y_train_fold)
|
||||
lst_accu_stratified.append(model_logisP.score(x_test_fold, y_test_fold))
|
||||
|
||||
# print output
|
||||
print('List of possible accuracy', lst_accu_stratified)
|
||||
print('Max accuracy:', max(lst_accu_stratified)*100, "%")
|
||||
print('Min accuracy:', min(lst_accu_stratified)*100, "%")
|
||||
print('Mean accuracy:', mean(lst_accu_stratified)*100,"%")
|
||||
print('St Dev:', stdev(lst_accu_stratified)*100,"%")
|
||||
|
||||
####################################
|
||||
# Model 3: stratified K-fold
|
||||
# Random forest
|
||||
# MinMaxScaler
|
||||
# X: needs to be an array for str Kfold
|
||||
####################################
|
||||
|
||||
model_rf = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
|
||||
, ('rf' , RandomForestClassifier(n_estimators=100, random_state=42))])
|
||||
seed_skf = 50
|
||||
skf = StratifiedKFold(n_splits = 10
|
||||
, shuffle = True
|
||||
, random_state = seed_skf)
|
||||
|
||||
X_array = np.array(X)
|
||||
lst_accu_stratified_rf = []
|
||||
for train_index, test_index in skf.split(X_array, Y):
|
||||
x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
|
||||
y_train_fold, y_test_fold = Y[train_index], Y[test_index]
|
||||
model_rf.fit(x_train_fold, y_train_fold)
|
||||
lst_accu_stratified_rf.append(model_rf.score(x_test_fold, y_test_fold))
|
||||
|
||||
# print output
|
||||
print('List of possible accuracy', lst_accu_stratified_rf)
|
||||
print('Max accuracy:', max(lst_accu_stratified_rf)*100, "%")
|
||||
print('Min accuracy:', min(lst_accu_stratified_rf)*100, "%")
|
||||
print('Mean accuracy:', mean(lst_accu_stratified_rf)*100,"%")
|
||||
print('St Dev:', stdev(lst_accu_stratified_rf)*100,"%")
|
||||
|
||||
####################################
|
||||
# Model 4: Cross validate K-fold
|
||||
# Random forest
|
||||
# MinMaxScaler
|
||||
# X: needs to be an array for Kfold
|
||||
# FIXME: DOESNT WORK BECAUSE MSE is for LR, not Logistic or random?
|
||||
####################################
|
||||
from sklearn.metrics import mean_squared_error, make_scorer
|
||||
from sklearn.model_selection import cross_validate
|
||||
|
||||
score_fn = make_scorer(mean_squared_error)
|
||||
scores = cross_validate(model_rf, X_train, y_train
|
||||
, scoring = score_fn
|
||||
, cv = 10)
|
||||
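# Re the FIXME above: mean_squared_error is a regression metric, so it is an odd fit for
# these classifiers. A minimal sketch using classification scorers instead (built-in
# scorer strings, no make_scorer needed; scores_clf is an illustrative name):
scores_clf = cross_validate(model_rf, X_train, y_train
                            , scoring = {'acc': 'accuracy', 'f1': 'f1'}
                            , cv = 10)
pd.DataFrame(scores_clf).mean()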
|
||||
from itertools import combinations
|
||||
def train(X):
|
||||
return cross_validate(model_rf, X, y_train
|
||||
, scoring = score_fn
|
||||
, cv = 10
|
||||
, return_estimator = True)['test_score']
|
||||
scores = [train(X_train.loc[:,vars]) for vars in combinations(X_train.columns,11)]
|
||||
means = [score.mean() for score in scores]
|
||||
#%%
|
||||
# https://stackoverflow.com/questions/52316237/finding-logistic-regression-weights-from-k-fold-cv
|
||||
from sklearn.linear_model import LogisticRegressionCV
|
||||
from sklearn.model_selection import KFold
|
||||
kf = KFold(n_splits=10, shuffle=True, random_state=42)
|
||||
logistic = LogisticRegressionCV(Cs=2, fit_intercept=True, cv=kf, verbose =1, random_state=42)
|
||||
logistic.fit(X_train, y_train)
|
||||
print("Train Coefficient:" , logistic.coef_) #weights of each feature
|
||||
print("Train Intercept:" , logistic.intercept_) #value of intercept
|
||||
#%%
|
||||
# https://machinelearningmastery.com/repeated-k-fold-cross-validation-with-python/
|
||||
from sklearn.model_selection import cross_val_score
|
||||
from numpy import std
|
||||
cv = KFold(n_splits=10, random_state=1, shuffle=True)
|
||||
scores = cross_val_score(model_rf, X,Y, scoring='accuracy', cv=cv, n_jobs=-1)
|
||||
scores2 = cross_val_score(model_logisP, X, Y, scoring='accuracy', cv=cv, n_jobs=-1)
|
||||
# report performance
|
||||
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
|
||||
print('Accuracy: %.3f (%.3f)' % (mean(scores2), stdev(scores2)))
|
172 earlier_versions/my_datap5.py Normal file
@@ -0,0 +1,172 @@
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Thu Mar 3 17:08:18 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.naive_bayes import BernoulliNB
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
|
||||
from sklearn.neural_network import MLPClassifier
|
||||
from sklearn.pipeline import Pipeline
|
||||
from xgboost import XGBClassifier
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
from sklearn.model_selection import train_test_split
|
||||
import os
|
||||
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
|
||||
import pandas as pd
|
||||
#%%
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir(homedir + "/git/ML_AI_training/test_data")
|
||||
|
||||
#gene 'pncA'
|
||||
#drug = 'pyrazinamide'
|
||||
|
||||
#==============
|
||||
# directories
|
||||
#==============
|
||||
datadir = homedir + '/git/Data/'
|
||||
indir = datadir + drug + '/input/'
|
||||
outdir = datadir + drug + '/output/'
|
||||
|
||||
#=======
|
||||
# input
|
||||
#=======
|
||||
# this needs to be merged_df2 or merged_df3?
|
||||
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
|
||||
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
|
||||
|
||||
my_df = pd.read_csv(infile_ml1)
|
||||
my_df.dtypes
|
||||
my_df_cols = my_df.columns
|
||||
|
||||
gene_baiscL = ['pnca']
|
||||
geneL_naL = ['gid', 'rpob']
|
||||
geneL_ppi2L = ['alr', 'embb', 'katg', 'rpob']
|
||||
#%%============================================================================
|
||||
# GET Y
|
||||
# Y = my_df.loc[:,drug] #has NA
|
||||
dm_om_map = {'DM': 1, 'OM': 0}
|
||||
my_df['resistance'] = my_df['mutation_info_labels'].map(dm_om_map)
|
||||
|
||||
# sanity check
|
||||
my_df['resistance'].value_counts()
|
||||
my_df['mutation_info_labels'].value_counts()
|
||||
Y = my_df['resistance']
|
||||
|
||||
# GET X
|
||||
cols = my_df.columns
|
||||
X_stability = my_df[['ligand_distance'
|
||||
, 'ligand_affinity_change'
|
||||
, 'duet_stability_change'
|
||||
, 'ddg_foldx'
|
||||
, 'deepddg'
|
||||
, 'ddg_dynamut2']]
|
||||
|
||||
X_evol = my_df[['consurf_score'
|
||||
, 'snap2_score'
|
||||
, 'snap2_accuracy_pc']]
|
||||
|
||||
X_str = my_df[['asa'
|
||||
, 'rsa'
|
||||
, 'kd_values'
|
||||
, 'rd_values']]
|
||||
|
||||
#%% try combinations
|
||||
X_vars = X_stability
|
||||
X_vars = X_evol
|
||||
X_vars = X_str
|
||||
|
||||
X_vars = pd.concat([X_stability, X_evol, X_str], axis = 1)
|
||||
X_vars = pd.concat([X_stability, X_evol], axis = 1)
|
||||
X_vars = pd.concat([X_stability, X_str], axis = 1)
|
||||
X_vars = pd.concat([X_evol, X_str], axis = 1)
|
||||
|
||||
#%%
|
||||
X_vars.shape[1]
|
||||
|
||||
# TODO: stratified cross validate
|
||||
# Train-test Split
|
||||
rs = {'random_state': 42}
|
||||
X_train, X_test, y_train, y_test = train_test_split(X_vars,
|
||||
Y,
|
||||
test_size = 0.33,
|
||||
random_state = 42)
|
||||
|
||||
# Classification - Model Pipeline
|
||||
def modelPipeline(X_train, X_test, y_train, y_test):
|
||||
|
||||
log_reg = LogisticRegression(**rs)
|
||||
nb = BernoulliNB()
|
||||
knn = KNeighborsClassifier()
|
||||
svm = SVC(**rs)
|
||||
mlp = MLPClassifier(max_iter=500, **rs)
|
||||
dt = DecisionTreeClassifier(**rs)
|
||||
et = ExtraTreesClassifier(**rs)
|
||||
rf = RandomForestClassifier(**rs)
|
||||
xgb = XGBClassifier(**rs, verbosity=0)
|
||||
|
||||
clfs = [
|
||||
('Logistic Regression', log_reg),
|
||||
('Naive Bayes', nb),
|
||||
('K-Nearest Neighbors', knn),
|
||||
('SVM', svm),
|
||||
('MLP', mlp),
|
||||
('Decision Tree', dt),
|
||||
('Extra Trees', et),
|
||||
('Random Forest', rf),
|
||||
('XGBoost', xgb)
|
||||
]
|
||||
|
||||
|
||||
pipelines = []
|
||||
|
||||
scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
|
||||
|
||||
|
||||
for clf_name, clf in clfs:
|
||||
|
||||
pipeline = Pipeline(steps=[
|
||||
('scaler', MinMaxScaler()),
|
||||
('classifier', clf)
|
||||
]
|
||||
)
|
||||
pipeline.fit(X_train, y_train)
|
||||
|
||||
# Model predictions
|
||||
y_pred = pipeline.predict(X_test)
|
||||
|
||||
# F1-Score
|
||||
fscore = f1_score(y_test, y_pred)
|
||||
# Precision
|
||||
pres = precision_score(y_test, y_pred)
|
||||
# Recall
|
||||
rcall = recall_score(y_test, y_pred)
|
||||
# Accuracy
|
||||
accu = accuracy_score(y_test, y_pred)
|
||||
# ROC_AUC
|
||||
roc_auc = roc_auc_score(y_test, y_pred)
|
||||
|
||||
|
||||
pipelines.append(pipeline)
|
||||
|
||||
scores_df = scores_df.append({
|
||||
'Model' : clf_name,
|
||||
'F1_Score' : fscore,
|
||||
'Precision' : pres,
|
||||
'Recall' : rcall,
|
||||
'Accuracy' : accu,
|
||||
'ROC_AUC' : roc_auc
|
||||
|
||||
},
|
||||
ignore_index = True)
|
||||
|
||||
return pipelines, scores_df
|
||||
|
||||
|
||||
pipelines, scores_df = modelPipeline(X_train, X_test, y_train, y_test) # capture the returned pipelines and per-model scores rather than discarding them
scores_df
|
207 earlier_versions/my_datap6.py Normal file
@@ -0,0 +1,207 @@
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Mar 4 14:54:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
import os, sys
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.naive_bayes import BernoulliNB
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
|
||||
from sklearn.neural_network import MLPClassifier
|
||||
from sklearn.pipeline import Pipeline
|
||||
from xgboost import XGBClassifier
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
|
||||
#%%
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir(homedir + "/git/ML_AI_training/")
|
||||
|
||||
# my function
|
||||
from MultClassPipe import MultClassPipeline
|
||||
|
||||
gene = 'pncA'
|
||||
drug = 'pyrazinamide'
|
||||
|
||||
#==============
|
||||
# directories
|
||||
#==============
|
||||
datadir = homedir + '/git/Data/'
|
||||
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'

#=======
# input
#=======
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'

my_df = pd.read_csv(infile_ml1)
my_df.dtypes
my_df_cols = my_df.columns

geneL_basic = ['pnca']
geneL_na = ['gid']
geneL_na_ppi2 = ['rpob']
geneL_ppi2 = ['alr', 'embb', 'katg']
#%% get cols
mycols = my_df.columns

#%%============================================================================
# GET Y

# Target1: mutation_info_labels
dm_om_map = {'DM': 1, 'OM': 0}
target1 = my_df['mutation_info_labels'].map(dm_om_map)

# Target2: drug
drug_labels = drug + '_labels'
drug_labels
my_df[drug_labels] = my_df[drug].map({1: 'resistant', 0: 'sensitive'})
my_df[drug_labels].value_counts()
my_df[drug_labels] = my_df[drug_labels].fillna('unknown')
my_df[drug_labels].value_counts()
target2 = my_df[drug_labels]

# Target3: drtype (binary)
drtype_labels = 'drtype_labels'
my_df[drtype_labels] = my_df['drtype'].map({'Sensitive': 0
                                            , 'Other'  : 0
                                            , 'Pre-MDR': 1
                                            , 'MDR'    : 1
                                            , 'Pre-XDR': 1
                                            , 'XDR'    : 1})
# target3 = my_df['drtype']
target3 = my_df[drtype_labels]

# Target4: drtype (three classes)
drtype_labels2 = 'drtype_labels2'
my_df[drtype_labels2] = my_df['drtype'].map({'Sensitive': 0
                                             , 'Other'  : 0
                                             , 'Pre-MDR': 1
                                             , 'MDR'    : 1
                                             , 'Pre-XDR': 2
                                             , 'XDR'    : 2})

target4 = my_df[drtype_labels2]

# sanity checks
target1.value_counts()
my_df['mutation_info_labels'].value_counts()

target2.value_counts()
my_df[drug_labels].value_counts()

target3.value_counts()
my_df['drtype'].value_counts()
target4.value_counts()
my_df['drtype'].value_counts()

#%%
# GET X
common_cols_stabilty = ['ligand_distance'
                        , 'ligand_affinity_change'
                        , 'duet_stability_change'
                        , 'ddg_foldx'
                        , 'deepddg'
                        , 'ddg_dynamut2']

# Build stability columns ~ gene
if gene.lower() in geneL_basic:
    x_stability_cols = common_cols_stabilty

if gene.lower() in geneL_ppi2:
    x_stability_cols = common_cols_stabilty + ['mcsm_ppi2_affinity'
                                               , 'interface_dist']
if gene.lower() in geneL_na:
    x_stability_cols = common_cols_stabilty + ['mcsm_na_affinity']

if gene.lower() in geneL_na_ppi2:
    x_stability_cols = common_cols_stabilty + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']

# D1148: get rid of rows where mcsm_na_affinity is NaN
na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
my_df = my_df.drop(index=na_index)

X_strF = ['asa'
          , 'rsa'
          , 'kd_values'
          , 'rd_values']

X_evolF = ['consurf_score'
           , 'snap2_score'
           , 'snap2_accuracy_pc']

# TODO: ADD ED values
# Problematic due to NA
# X_genomicF = ['af'
#               , 'or_mychisq'
#               , 'or_logistic'
#               , 'or_fisher'
#               , 'pval_fisher']

#%% try combinations
X_vars1 = my_df[x_stability_cols]
X_vars2 = my_df[X_strF]
X_vars3 = my_df[X_evolF]
#X_vars4 = my_df[X_genomicF]
#X_vars4 = X_vars4.fillna('unknown') # needs a one-hot encoder!

X_vars5 = my_df[x_stability_cols + X_strF]
X_vars6 = my_df[x_stability_cols + X_evolF]
#X_vars7 = my_df[x_stability_cols + X_genomicF]
X_vars8 = my_df[X_strF + X_evolF]
#X_vars9 = my_df[X_strF + X_genomicF]
#X_vars10 = my_df[X_evolF + X_genomicF]
X_vars11 = my_df[x_stability_cols + X_strF + X_evolF]
#X_vars12 = my_df[x_stability_cols + X_strF + X_evolF + X_genomicF]

numerical_features_names = x_stability_cols + X_strF + X_evolF

# separate ones for foldx?
categorical_features_names = ['ss_class'
                              , 'wt_prop_water'
                              # , 'lineage_labels' # misleading if using merged_df3
                              , 'mut_prop_water'
                              , 'wt_prop_polarity'
                              , 'mut_prop_polarity'
                              , 'wt_calcprop'
                              , 'mut_calcprop'
                              , 'active_aa_pos']

numerical_features_df = my_df[numerical_features_names]
numerical_features_df.shape

categorical_features_df = my_df[categorical_features_names]
categorical_features_df.shape

all_features_df = my_df[numerical_features_names + categorical_features_names]
all_features_df.shape
#%%
X_vars1.shape[1]
X_vars5.shape[1]
# TODO: stratified cross-validation (see skf_mm.py below)
# Train-test split

# TARGET1
X_train, X_test, y_train, y_test = train_test_split(X_vars1,
                                                     target1,
                                                     test_size = 0.33,
                                                     random_state = 42)
t1_res = MultClassPipeline(X_train, X_test, y_train, y_test)
t1_res
# TARGET3
X_train3, X_test3, y_train3, y_test3 = train_test_split(X_vars5,
                                                         target3,
                                                         test_size = 0.33,
                                                         random_state = 42)
t3_res = MultClassPipeline(X_train3, X_test3, y_train3, y_test3)
t3_res
#%%
207
earlier_versions/my_datap7.py
Normal file
@@ -0,0 +1,207 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 5 12:57:32 2022

@author: tanu
"""

import os, sys
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report

from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV

#############################
# trying feature selection
#############################
#%%
model = Pipeline(steps = [
    ('pre', MinMaxScaler()),
    ('reg', LogisticRegression(class_weight = 'balanced'))])

def precision(y_true, y_pred):
    return precision_score(y_true, y_pred, pos_label = 1)
def recall(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label = 1)
def f1(y_true, y_pred):
    return f1_score(y_true, y_pred, pos_label = 1)

acc = make_scorer(accuracy_score)
prec = make_scorer(precision)
rec = make_scorer(recall)
f1 = make_scorer(f1)

output = cross_validate(model, X_train, y_train
                        , scoring = {'acc' : acc
                                     , 'prec': prec
                                     , 'rec' : rec
                                     , 'f1'  : f1}
                        , cv = 10
                        , return_train_score = False)
pd.DataFrame(output).mean()

#%%
# classification_report: lowest scores, but does it give numbers for all your classes?
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

f1_score(y_test, y_pred)
roc_auc_score(y_test, y_pred) # not sure! (computed from hard labels; see the probability-based sketch below)
#roc_curve(y_test, y_pred)
classification_report(y_test, y_pred)
target_names = {1:'Resistant', 0:'Sensitive'}
print(classification_report(y_test
                            , y_pred
                            #, target_names=y_test.map(target_names)
                            ))
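# A small sketch (not in the original script): roc_auc_score above was given hard labels;
# for a proper AUC it is usually fed class probabilities, which the fitted Pipeline 'model'
# exposes via predict_proba. Assumes the X_test/y_test defined above.
y_probs = model.predict_proba(X_test)[:, 1]   # probability of the positive class (DM = 1)
roc_auc_score(y_test, y_probs)
fpr, tpr, thresholds = roc_curve(y_test, y_probs)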
#%% NOT SURE!
from itertools import combinations
def train(X):
    return cross_validate(model, X, y_train
                          #, scoring = make_scorer(accuracy_score)
                          , scoring = {'acc' : acc
                                       , 'prec': prec
                                       , 'rec' : rec
                                       , 'f1'  : f1}
                          , cv = 10
                          , return_train_score = False)
#, return_estimator = True)['test_score']

# NOTE: combinations(X_train.columns, len(X_train.columns)) yields only the full column set;
# use a smaller r to score feature subsets. Also, cross_validate returns a dict, so take
# pd.DataFrame(score).mean() rather than score.mean().
scores = [train(X_train.loc[:, vars]) for vars in combinations(X_train.columns, len(X_train.columns))]
means = [score.mean() for score in scores]
means

#%%
# TO TRY
# https://rasbt.github.io/mlxtend/

# stackoverflow
# informative posts
# https://datascience.stackexchange.com/questions/937/does-scikit-learn-have-a-forward-selection-stepwise-regression-algorithm
# https://datascience.stackexchange.com/questions/24405/how-to-do-stepwise-regression-using-sklearn/24447#24447
# https://stats.stackexchange.com/questions/204141/difference-between-selecting-features-based-on-f-regression-and-based-on-r2


# scikit-learn >= 0.24 supports SequentialFeatureSelector (sketch below)
# https://scikit-learn.org/stable/auto_examples/release_highlights/plot_release_highlights_0_24_0.html#new-sequentialfeatureselector-transformer
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html
# https://www.scikit-yb.org/en/latest/api/model_selection/rfecv.html
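# A minimal sketch (not in the original script) of the SequentialFeatureSelector mentioned
# above; assumes scikit-learn >= 0.24 and the X_train/y_train from the split above. The
# choice of 4 features, forward direction and F1 scoring is arbitrary.
from sklearn.feature_selection import SequentialFeatureSelector

sfs = SequentialFeatureSelector(LogisticRegression(max_iter = 1000)
                                , n_features_to_select = 4
                                , direction = 'forward'
                                , scoring = 'f1'
                                , cv = 10)
sfs.fit(X_train, y_train)
print(sfs.get_support())            # boolean mask of the selected columns
X_train_sfs = sfs.transform(X_train)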
#GridSearchCV
#ParameterGrid
#RandomizedSearchCV
#https://medium.com/analytics-vidhya/hyper-parameter-tuning-gridsearchcv-vs-randomizedsearchcv-499862e3ca5
#%% RFE: Feature selection in classification
# other examples:
# https://towardsdatascience.com/feature-selection-techniques-for-classification-and-python-tips-for-their-application-10c0ddd7918b
# https://towardsdatascience.com/feature-selection-using-python-for-classification-problem-b5f00a1c7028

model_logistic = LogisticRegression(solver='lbfgs'
                                    , multi_class = 'multinomial'
                                    , max_iter = 1000)
model_logistic = LogisticRegression()

sel_rfe_logistic = RFE(estimator = model_logistic
                       , n_features_to_select = 4
                       , step = 1)

X_train_rfe_logistic = sel_rfe_logistic.fit_transform(X_train, y_train)
print(sel_rfe_logistic.get_support())

print(sel_rfe_logistic.ranking_)
#%% RFECV
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html

target = target1
target = target3
target = target4
X_train, X_test, y_train, y_test = train_test_split(X_vars1,
                                                     target,
                                                     test_size = 0.33,
                                                     random_state = 42)

X_train, X_test, y_train, y_test = train_test_split(X_vars2,
                                                     target,
                                                     test_size = 0.33,
                                                     random_state = 42)

X_train, X_test, y_train, y_test = train_test_split(X_vars3,
                                                     target,
                                                     test_size = 0.33,
                                                     random_state = 42)

X_train, X_test, y_train, y_test = train_test_split(X_vars5,
                                                     target,
                                                     test_size = 0.33,
                                                     random_state = 42)

X_train, X_test, y_train, y_test = train_test_split(X_vars11,
                                                     target,
                                                     test_size = 0.33,
                                                     random_state = 42)

model_logistic2 = LogisticRegression()
sel_rfe_logistic = RFECV(estimator = model_logistic2
                         , cv = 10
                         , step = 1)

X_train_rfe_logistic = sel_rfe_logistic.fit_transform(X_train, y_train)
print(sel_rfe_logistic.get_support())
X_train.columns

print(sel_rfe_logistic.ranking_)
#%%
# TODO: imputation
# Find out the best way to impute values!
#from sklearn.impute import SimpleImputer
# https://towardsdatascience.com/whats-the-best-way-to-handle-nan-values-62d50f738fc
# KNN and MICE (see the sketch at the end of this file)
my_df2 = pd.read_csv(infile_ml1)

genomicF = ['af'
            , 'beta_logistic'
            , 'or_logistic'
            , 'pval_logistic'
            , 'se_logistic'
            , 'zval_logistic'
            , 'ci_low_logistic'
            , 'ci_hi_logistic'
            , 'or_mychisq'
            , 'log10_or_mychisq'
            , 'or_fisher'
            , 'pval_fisher'
            , 'neglog_pval_fisher'
            , 'ci_low_fisher'
            , 'ci_hi_fisher'
            , 'est_chisq'
            , 'pval_chisq']

# X_genomicF = ['af'
#               , 'or_mychisq'
#               , 'or_logistic'
#               , 'or_fisher'
#               , 'pval_fisher']

my_df2[genomicF].isna().sum()

# NOTE: filling numeric columns with the string 'unknown' turns them into object dtype
my_df2[genomicF] = my_df2[genomicF].fillna(value='unknown')
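# A hedged sketch of the KNN / MICE-style imputation flagged in the TODO above (not in the
# original script). KNNImputer and IterativeImputer come from sklearn.impute; the
# n_neighbors / max_iter values are arbitrary, and the frame is re-read so the NaNs are
# still present (i.e. this replaces, rather than follows, the fillna above).
from sklearn.experimental import enable_iterative_imputer   # noqa: enables IterativeImputer
from sklearn.impute import KNNImputer, IterativeImputer

my_df2_raw = pd.read_csv(infile_ml1)

knn_imp = KNNImputer(n_neighbors = 5)
genomic_knn = pd.DataFrame(knn_imp.fit_transform(my_df2_raw[genomicF])
                           , columns = genomicF
                           , index = my_df2_raw.index)

mice_imp = IterativeImputer(max_iter = 10, random_state = 42)   # MICE-like chained equations
genomic_mice = pd.DataFrame(mice_imp.fit_transform(my_df2_raw[genomicF])
                            , columns = genomicF
                            , index = my_df2_raw.index)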
118
earlier_versions/my_datap8.py
Normal file
@@ -0,0 +1,118 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 5 12:57:32 2022

@author: tanu
"""
#%%
# data, etc. for now comes from my_data6.py and/or my_data5.py

#%% try combinations
#import sys, os
#os.system("imports.py")


#%%
seed = 42
features_to_encode = list(X_train.select_dtypes(include = ['object']).columns)

col_trans = make_column_transformer(
    (OneHotEncoder(), features_to_encode),
    remainder = "passthrough"
)

rf_classifier = RandomForestClassifier(
    min_samples_leaf=50,
    n_estimators=150,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=seed,
    max_features='auto')

pipe = make_pipeline(col_trans, rf_classifier)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
#%%

all_features_df.shape


X_train, X_test, y_train, y_test = train_test_split(all_features_df,
                                                     target1,
                                                     test_size = 0.33,
                                                     random_state = 42)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler() , numerical_features_names)      # column names, not the DataFrames
        ,('cat', OneHotEncoder(), categorical_features_names)])

seed = 42
rf_classifier = RandomForestClassifier(
    min_samples_leaf=50,
    n_estimators=150,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=seed,
    max_features='auto')

preprocessor.fit(all_features_df)
preprocessor.transform(all_features_df)

model = Pipeline(steps = [
    ('preprocess', preprocessor)
    ,('regression', linear_model.LogisticRegression())
    ])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred


def precision(y_true, y_pred):
    return precision_score(y_true, y_pred, pos_label = 1)
def recall(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label = 1)
def f1(y_true, y_pred):
    return f1_score(y_true, y_pred, pos_label = 1)

acc = make_scorer(accuracy_score)
prec = make_scorer(precision)
rec = make_scorer(recall)
f1 = make_scorer(f1)

output = cross_validate(model, X_train, y_train
                        , scoring = {'acc' : acc
                                     , 'prec': prec
                                     , 'rec' : rec
                                     , 'f1'  : f1}
                        , cv = 10
                        , return_train_score = False)
pd.DataFrame(output).mean()
#%% with feature selection
# NOTE: the preprocessor above references the categorical column names, so it needs a frame
# that contains them (e.g. all_features_df); fitting it on numerical_features_df alone raises
# a missing-columns error.
preprocessor.fit(numerical_features_df)
preprocessor.transform(numerical_features_df)

model = Pipeline(steps = [
    ('preprocess', preprocessor)
    ,('regression', linear_model.LogisticRegression())
    ])



selector_logistic = RFECV(estimator = model        # see the RFECV-with-Pipeline sketch below
                          , cv = 10
                          , step = 1)

X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df
                                                        , target1
                                                        , test_size = 0.33
                                                        , random_state = 42)

selector_logistic_xtrain = selector_logistic.fit_transform(X_trainN, y_trainN)
print(selector_logistic.get_support())
X_trainN.columns

print(selector_logistic.ranking_)
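# A hedged sketch (not in the original script): RFECV needs coef_ / feature_importances_
# from its estimator, so wrapping a Pipeline requires the importance_getter argument
# (scikit-learn >= 0.24). Shown with a plain scaler + logistic-regression pipeline on the
# numerical features only, since RFECV passes column subsets as arrays.
pipe_lr = Pipeline(steps = [('pre', MinMaxScaler())
                            , ('logistic', LogisticRegression(max_iter = 1000))])
selector_pipe = RFECV(estimator = pipe_lr
                      , cv = 10
                      , step = 1
                      , importance_getter = 'named_steps.logistic.coef_')
selector_pipe.fit(X_trainN, y_trainN)
print(selector_pipe.get_support())
print(selector_pipe.ranking_)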
144
earlier_versions/my_datap9.py
Normal file
@@ -0,0 +1,144 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 5 12:57:32 2022

@author: tanu
"""
#%%
# data, etc. for now comes from my_data6.py and/or my_data5.py
#%% try combinations
#import sys, os
#os.system("imports.py")
def precision(y_true, y_pred):
    return precision_score(y_true, y_pred, pos_label = 1)
def recall(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label = 1)
def f1(y_true, y_pred):
    return f1_score(y_true, y_pred, pos_label = 1)

#%%
numerical_features_df.shape
categorical_features_df.shape
all_features_df.shape
#%%
target = target1
#target = target3
X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df,
                                                        target,
                                                        test_size = 0.33,
                                                        random_state = 42)

X_trainC, X_testC, y_trainC, y_testC = train_test_split(categorical_features_df,
                                                        target,
                                                        test_size = 0.33,
                                                        random_state = 42)

X_train, X_test, y_train, y_test = train_test_split(all_features_df,
                                                     target,
                                                     test_size = 0.33,
                                                     random_state = 42)
#%%

#%%
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler() , numerical_features_names)
        ,('cat', OneHotEncoder(), categorical_features_names)
    ], remainder = 'passthrough')

# NOTE: the transformers reference the categorical column names, so the preprocessor needs a
# frame that contains them (all_features_df / X_train); fitting on numerical_features_df or
# X_trainN alone raises a missing-columns error.
f = preprocessor.fit(numerical_features_df)
f2 = preprocessor.transform(numerical_features_df)

f3 = preprocessor.fit_transform(numerical_features_df)
(f3==f2).all()

preprocessor.fit_transform(numerical_features_df)

#preprocessor.fit_transform(all_features_df)

#%%
model_log = Pipeline(steps = [
    ('preprocess', preprocessor)
    #,('log_reg', linear_model.LogisticRegression())
    ,('log_reg', LogisticRegression(
        class_weight = None))   # 'unbalanced' is not a valid option; None leaves classes unweighted
    ])
model = model_log
#%%
seed = 42
model_rf = Pipeline(steps = [
    ('preprocess', preprocessor)
    ,('rf', RandomForestClassifier(
        min_samples_leaf=50,
        n_estimators=150,
        bootstrap=True,
        oob_score=True,
        n_jobs=-1,
        random_state=seed,
        max_features='auto'))
    ])
model = model_rf
#%%
model.fit(X_trainN, y_trainN)
y_pred = model.predict(X_testN)
y_pred

acc = make_scorer(accuracy_score)
prec = make_scorer(precision)
rec = make_scorer(recall)
f1 = make_scorer(f1)

output = cross_validate(model, X_trainN, y_trainN
                        , scoring = {'acc' : acc
                                     , 'prec': prec
                                     , 'rec' : rec
                                     , 'f1'  : f1}
                        , cv = 10
                        , return_train_score = False)
pd.DataFrame(output).mean()
#%% Run multiple models using MultClassPipeline
# only good for numerical features, as categorical features are not supported yet!
t1_res = MultClassPipeline2(X_trainN, X_testN, y_trainN, y_testN, input_df = all_features_df)
t1_res

#%%
# https://machinelearningmastery.com/columntransformer-for-numerical-and-categorical-data/
# Each transformer is a three-element tuple that defines the name of the transformer, the
# transform to apply, and the column indices to apply it to. For example:
# (Name, Object, Columns)

# Determine categorical and numerical features
numerical_ix = all_features_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = all_features_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix

# Define the data preparation for the columns
t = [('cat', OneHotEncoder(), categorical_ix)
     , ('num', MinMaxScaler(), numerical_ix)]
col_transform = ColumnTransformer(transformers=t
                                  , remainder='passthrough')
# create a pipeline (unlike the example above, where the column transformer was a preprocess step and was fit_transformed)

pipeline = Pipeline(steps=[('prep', col_transform)
                           , ('classifier', LogisticRegression())])
#%% Added this to the MultClassPipeline

tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN)
tN_res

t2_res = MultClassPipeline2(X_train, X_test, y_train, y_test, input_df = all_features_df)
t2_res

t3_res = MultClassPipeSKF(input_df = numerical_features_df
                          , y_targetF = target1
                          , var_type = 'numerical'
                          , skf_splits = 10)
t3_res


t4_res = MultClassPipeSKF(input_df = all_features_df
                          , y_targetF = target1
                          , var_type = 'mixed'
                          , skf_splits = 10)
t4_res
405
earlier_versions/p_jr_d1.py
Normal file
@@ -0,0 +1,405 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 17 14:52:55 2022

@author: tanu
"""
from sklearn.datasets import load_boston
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)

boston = load_boston()
dir(boston)
#['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'target']
X, y = boston.data, boston.target
df = pd.DataFrame(X, columns = boston.feature_names)
df['MEDV'] = y

sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
plt.show()

# Model fitting
# To fit a model using just a single predictor we first extract the training variables.
X_train = df['CRIM']
y_train = y
# Unfortunately, sklearn's various model fitting functions typically expect a
# two-dimensional array for the covariates. Since we have extracted only a
# single feature here, it is only one-dimensional. We need to reshape the
# X_train values to be the appropriate shape.
# This is not necessary if using more than a single feature.

if len(X_train.values.shape) == 1:
    X_train = X_train.values.reshape(-1, 1)

# Create a LinearRegression object: this object is of a broader class of estimator objects.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

# We can make predictions from our fitted model with the .predict() method.
new_value = np.array(4.09, ndmin = 2)
model.predict(new_value)
multiple_values = np.array([1, 2, 3], ndmin = 2).T
model.predict(multiple_values)

# Fitted values
# The fitted values of a model are the predicted ŷ for the observations X.
# To get the model fitted values we can just predict from the model using the
# values used to train it.
fitted = model.predict(X_train)
ax = sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
sns.lineplot(df['CRIM'], fitted, ax = ax)
plt.show()

# Interpreting the coefficients
# The coefficients of the fitted model are kept in the model.coef_ attribute.
# This gives us the expected change in y for a unit change in X.
model.coef_


# 2.3 Multiple linear regression
X_train = df.iloc[:,:3]
grid = sns.PairGrid(data=pd.concat([X_train, pd.Series(y_train, name="MEDV")], axis = 1))
grid.map_offdiag(sns.scatterplot)
grid.map_diag(sns.distplot)
plt.show()
model.fit(X_train, y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)
# Residuals
# In classical statistics, one of our assumptions is that the residuals are
# normally distributed. A small RSS implies the fitted model is closer to the
# observations.

fitted = model.predict(X_train)
resid = y_train - fitted
# Standardise to remove the effect of measurement scale
resid = (resid - np.mean(resid))/np.std(resid, ddof = 1)
plt.figure()
for i in range(3):
    xvar = X_train.iloc[:,i]
    ax = plt.subplot(3, 1, i + 1)
    ax.scatter(xvar, resid)
    ax.set_xlabel(boston.feature_names[i])
    ax.set_ylabel("Residuals")
    ax.hlines([-2, 0, 2], np.min(xvar), np.max(xvar))

plt.show()
plt.figure()
ax = plt.subplot(3, 1, 1)
ax.scatter(fitted, resid)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')

ax = plt.subplot(3, 1, 2)
ax.scatter(fitted, y_train)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Predicted values')

ax = plt.subplot(3, 1, 3)
import scipy.stats as stats
stats.probplot(resid, dist = 'norm', plot = ax)
plt.show()

# Scaling data: many types available
# sklearn comes with many preprocessing transformations in the sklearn.preprocessing module
# Scaling is crucial for many statistical and machine learning algorithms:
# • k-means and hierarchical clustering
#   – data units and variance play a crucial role in cluster selection
# • gradient descent optimisation
#   – scaled data allows the weights to update at an equal speed
# • scaled data allows the regression coefficients to be compared

#########################################################
# Min-max scaling
# DOESN'T change the shape
# DOES change the bounds, mean and sd
# NOT often used in LR
# used more in GDO (Gradient Descent Optimisation)

# the sklearn.preprocessing module has a MinMaxScaler() for this
##########################################################

np.random.seed(1)
x_n = np.random.normal(2, 5, 500)
x_t = np.random.standard_t(2, 500)
x_ln = np.random.lognormal(1, 1, 500)
df = pd.DataFrame({ 'Normal': x_n, 'T': x_t, 'Lognormal': x_ln
                  })

df_long = df.melt(var_name='Distribution')
g = sns.FacetGrid(df_long, col='Distribution', sharex=False)
g.map(plt.hist, 'value', bins = 50)
plt.show()

def min_max(x):
    min = np.min(x)
    s = (x - min)/(np.max(x) - min)
    return (s)

scaled = df.apply(min_max).melt(var_name='Distribution')

scaled['Scaled'] = True
df_long['Scaled'] = False
full_data = pd.concat([df_long, scaled], axis=0)

g = sns.FacetGrid(full_data, col='Distribution'
                  , row='Scaled'
                  , sharex=False
                  , sharey=False)

g.map(plt.hist, 'value', bins = 50)

plt.show()

df.apply([np.mean, np.std])
df.apply(min_max).apply([np.mean, np.std])

# sklearn: MinMaxScaler()

scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_train_scaled[:1]

##########################################################
# z-score standardisation
# DOESN'T change the shape
# popular in linear models
# DOESN'T affect the predictions
# but makes the size of the coefficients directly comparable

# the sklearn.preprocessing module has a StandardScaler() for this
##########################################################

def z_score(x):
    mean = np.mean(x)
    std = np.std(x, ddof=1)
    return (x - mean)/std

scaled = df.apply(z_score).melt(var_name='Distribution')
scaled['Scaled'] = True
full_data = pd.concat([df_long, scaled], axis=0)
g = sns.FacetGrid(full_data, col='Distribution'
                  , row ='Scaled'
                  , sharex=False
                  , sharey=False)
g.map(plt.hist, 'value', bins=50)

###############################################
# Dividing by two standard deviations
# http://www.stat.columbia.edu/~gelman/research/published/standardizing7.pdf
# One of the downsides of scaling data by z-scoring is that it is not obvious
# how this should be handled in the case of categorical variables.

# The paper suggests a rescaling that divides numeric variables by two standard
# deviations, whilst leaving binary-encoded categorical variables untransformed.
# There is nothing in sklearn for this.
###############################################
from sklearn.base import BaseEstimator, TransformerMixin
class two_sd_scaler(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.stds = 2*np.std(X, axis=0, ddof=1)
        return self
    def transform(self, X, y=None):
        return X/self.stds
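# A minimal usage sketch of the custom scaler above (not in the original script; it reuses
# the boston X_train / y_train defined earlier in this file):
scaler_2sd = two_sd_scaler()
X_train_2sd = scaler_2sd.fit_transform(X_train)
model_2sd = linear_model.LinearRegression()
model_2sd.fit(X_train_2sd, y_train)
# new values must pass through the same two-SD scaling
model_2sd.predict(scaler_2sd.transform(np.array(X_train.mean(), ndmin = 2)))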

# Having preprocessed the data this way we can now fit a model to it in the
# same way as before.
model2 = linear_model.LinearRegression()
model2.fit(X_train_scaled, y_train)
# When making predictions on new values we also need to make sure to pass
# the new values through the same preprocessing step.

new_value = np.array(X_train.mean(), ndmin = 2)
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred

##########################
# 2.5 Creating a pipeline
##########################
# For any training data set and any data for prediction we will want to apply
# the same scaling transformation and use the same model. We can create a
# sklearn.pipeline.Pipeline() to organise the steps that create the estimator.

from sklearn.pipeline import Pipeline
model = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
                          ,('regression', linear_model.LinearRegression())
                          ])

# Having created the Pipeline object we can now fit as before. Calling .fit()
# now, however, will first fit the 'preprocess' step and then the 'regression'
# step. When we predict, the new values will also pass through both stages of
# our pipeline.
model.fit(X_train, y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)
#from sklearn.metrics import accuracy_score
#print(accuracy_score(y_test, model.predict(X_test)))

# 2.6 Preprocessing categorical variables
# One-hot encoding takes a categorical feature with K categories and creates a
# 'one of K' encoding scheme, i.e. a set of binary variables for each category.
# Consider the toy data:

toy = pd.DataFrame({
    'category':['a', 'a', 'b', 'c', 'b']
})
enc = preprocessing.OneHotEncoder()
enc.fit(toy)
enc.transform(toy).toarray()

# Combining preprocessing steps:
# combine the preprocessing steps into a single operation for our Pipeline
# using a sklearn.compose.ColumnTransformer
toy = pd.DataFrame({
    'numeric': [1., 2., 3., 4., 5.],
    'category': ['a', 'a', 'b', 'c', 'b']
})

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
numeric_features = ['numeric']
categorical_features = ['category']
preprocessor = ColumnTransformer(transformers=[('num', StandardScaler()
                                                , numeric_features)
                                               ,('cat', OneHotEncoder(), categorical_features)])

preprocessor.fit(toy)
preprocessor.transform(toy)

# This preprocessing step could then be a step in the pipeline for a regression
model = Pipeline(steps = [('preprocess', preprocessor)
                          ,('regression', linear_model.LinearRegression())])

# fit the preprocessor pipeline to the data
preprocessor.fit(toy)

# the transformer will now give the appropriate pre-processing for different types of variables
preprocessor.transform(toy)

# This preprocessing step could then be a step in the pipeline for a regression
model = Pipeline(steps = [('preprocess', preprocessor)
                          ,('regression', linear_model.LinearRegression())])

# Model Assessment and Feature Selection

#%%#####################################################################

# Accuracy score is only for classification problems.
# For regression problems you can use: R2 Score, MSE (Mean Squared Error), RMSE (Root Mean
# Squared Error) -- see the sketch below.
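# A small sketch of those regression metrics (not in the original script; 'reg', 'X_te' and
# 'y_te' are placeholders for any fitted regressor and its held-out data):
from sklearn.metrics import r2_score, mean_squared_error

y_hat = reg.predict(X_te)
r2 = r2_score(y_te, y_hat)
mse = mean_squared_error(y_te, y_hat)
rmse = np.sqrt(mse)                  # RMSE = sqrt(MSE)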
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import preprocessing

# read data
iris = datasets.load_iris()

# assign X and y
X = iris.data
y = iris.target

# split the data into train and test parts (25% of the data is the test set)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

# preprocess the data

# scaling
scaler = preprocessing.MinMaxScaler()
# fit the scaler to X_train
scaler.fit(X_train)
# apply the scaling/transformation to the data
X_train_scaled = scaler.transform(X_train)

# Choose the required model/s
model2 = linear_model.LinearRegression() # Classification metrics can't handle a mix of multiclass and continuous targets
model2 = DecisionTreeClassifier()

# fit the model to the data for predictions
model2.fit(X_train_scaled, y_train)
# check model performance (the test data must go through the same scaling)
print(accuracy_score(y_test, model2.predict(scaler.transform(X_test))))


# When making predictions on new values we also need to make sure to pass
# the new values through the same preprocessing step.
new_value = np.array(X_train.mean(), ndmin = 2)
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred

# or create a pipeline that standardises the data and then creates a model

# make a pipeline
# PCA (dimension reduction to two) -> scaling the data -> DecisionTreeClassifier
#https://www.geeksforgeeks.org/pipelines-python-and-scikit-learn/
pipe1 = Pipeline([('pca', PCA(n_components = 2))
                  , ('std', StandardScaler())
                  , ('decision_tree', DecisionTreeClassifier())]
                 , verbose = True)

pipe2 = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
                          #,('regression', linear_model.LinearRegression())
                          ,('rf', RandomForestClassifier())
                          ])

# fit pipeline to TRAINING data [X_train and y_train]
pipe1.fit(X_train, y_train)
pipe2.fit(X_train, y_train)

# model prediction on TEST data [X_test and y_test]
print(accuracy_score(y_test, pipe1.predict(X_test)))
print(accuracy_score(y_test, pipe2.predict(X_test)))
from sklearn.metrics import classification_report   # a Pipeline has no classification_report method
print(classification_report(y_test, pipe2.predict(X_test)))
enc = preprocessing.OneHotEncoder()
enc.fit(X_train)
enc.transform(X_train).toarray()
#%%
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
boston = load_boston()

X_train, y_train = pd.DataFrame(boston.data, columns = boston.feature_names), boston.target

model1 = Pipeline(steps = [
    ('pre', MinMaxScaler()),
    ('reg', LinearRegression())])

score_fn = make_scorer(mean_squared_error)
scores = cross_validate(model1, X_train, y_train
                        , scoring = score_fn
                        , cv = 10)

from itertools import combinations
def train(X):
    return cross_validate(model1, X, y_train
                          , scoring = score_fn
                          #, return_train_score = False)
                          , return_estimator = True)['test_score']

scores = [train(X_train.loc[:,vars]) for vars in combinations(X_train.columns, 12)]
means = [score.mean() for score in scores]
means
7
earlier_versions/p_jr_d2.py
Normal file
@@ -0,0 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 23 11:13:45 2022

@author: tanu
"""
99
earlier_versions/pnca_results_v1.py
Normal file
@@ -0,0 +1,99 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 7 15:20:42 2022

@author: tanu
"""
fit_time      0.008588
score_time    0.004460
test_acc      0.690148
test_prec     0.690868
test_rec      0.771250
test_f1       0.725441

# RF
fit_time      0.368793
score_time    0.110153
test_acc      0.672537
test_prec     0.664875
test_rec      0.790417
test_f1       0.720224
dtype: float64
#%%
numerical_features: ['ligand_distance', 'ligand_affinity_change'
                     , 'duet_stability_change', 'ddg_foldx', 'deepddg', 'ddg_dynamut2'
                     , 'asa', 'rsa', 'kd_values', 'rd_values'
                     , 'consurf_score', 'snap2_score', 'snap2_accuracy_pc']

   Model                 F1_Score  Precision  Recall    Accuracy  ROC_AUC
0  Logistic Regression   0.734177  0.690476   0.783784  0.700000  0.694922
1  Naive Bayes           0.467290  0.757576   0.337838  0.592857  0.608313
2  K-Nearest Neighbors   0.773006  0.707865   0.851351  0.735714  0.728706
3  SVM                   0.766467  0.688172   0.864865  0.721429  0.712735
4  MLP                   0.725000  0.674419   0.783784  0.685714  0.679771
5  Decision Tree         0.662069  0.676056   0.648649  0.650000  0.650082
6  Extra Trees           0.748387  0.716049   0.783784  0.721429  0.717649
7  Random Forest         0.722581  0.691358   0.756757  0.692857  0.688984
8  Random Forest2        0.731707  0.666667   0.810811  0.685714  0.678133
9  XGBoost               0.692810  0.670886   0.716216  0.664286  0.661138

all_features: numerical_features + ['ss_class', 'wt_prop_water', 'mut_prop_water', 'wt_prop_polarity',
                                    'mut_prop_polarity', 'wt_calcprop', 'mut_calcprop', 'active_aa_pos']

   Model                 F1_Score  Precision  Recall    Accuracy  ROC_AUC
0  Logistic Regression   0.757764  0.701149   0.824324  0.721429  0.715192
1  Naive Bayes           0.620690  0.633803   0.608108  0.607143  0.607084
2  K-Nearest Neighbors   0.619355  0.592593   0.648649  0.578571  0.574324
3  SVM                   0.766467  0.688172   0.864865  0.721429  0.712735
4  MLP                   0.738854  0.698795   0.783784  0.707143  0.702498
5  Decision Tree         0.666667  0.701493   0.635135  0.664286  0.666052
6  Extra Trees           0.728395  0.670455   0.797297  0.685714  0.678952
7  Random Forest         0.763636  0.692308   0.851351  0.721429  0.713554
8  Random Forest2        0.746988  0.673913   0.837838  0.700000  0.691646
9  XGBoost               0.710526  0.692308   0.729730  0.685714  0.683047
#%%
      Model                 F1_Score  Precision  Recall    Accuracy  ROC_AUC
0Num  Logistic Regression   0.734177  0.690476   0.783784  0.700000  0.694922
0All  Logistic Regression   0.757764  0.701149   0.824324  0.721429  0.715192

1Num  Naive Bayes           0.467290  0.757576   0.337838  0.592857  0.608313
1All  Naive Bayes           0.620690  0.633803   0.608108  0.607143  0.607084

2Num  K-Nearest Neighbors   0.773006  0.707865   0.851351  0.735714  0.728706  ** 'Num' is better than 'All'
2All  K-Nearest Neighbors   0.619355  0.592593   0.648649  0.578571  0.574324

3Num  SVM                   0.766467  0.688172   0.864865  0.721429  0.712735
3All  SVM                   0.766467  0.688172   0.864865  0.721429  0.712735

4Num  MLP                   0.725000  0.674419   0.783784  0.685714  0.679771
4All  MLP                   0.738854  0.698795   0.783784  0.707143  0.702498

5Num  Decision Tree         0.662069  0.676056   0.648649  0.650000  0.650082  ** marginal, equivalent
5All  Decision Tree         0.666667  0.701493   0.635135  0.664286  0.666052

6Num  Extra Trees           0.748387  0.716049   0.783784  0.721429  0.717649  ** marginal, equivalent
6All  Extra Trees           0.728395  0.670455   0.797297  0.685714  0.678952

7Num  Random Forest         0.722581  0.691358   0.756757  0.692857  0.688984
7All  Random Forest         0.763636  0.692308   0.851351  0.721429  0.713554

8Num  Random Forest2        0.731707  0.666667   0.810811  0.685714  0.678133
8All  Random Forest2        0.746988  0.673913   0.837838  0.700000  0.691646

9Num  XGBoost               0.692810  0.670886   0.716216  0.664286  0.661138
9All  XGBoost               0.710526  0.692308   0.729730  0.685714  0.683047


#%%
   Model                 F1_Score  Precision  Recall    Accuracy  ROC_AUC
0  Logistic Regression   0.757764  0.701149   0.824324  0.721429  0.715192
1  Naive Bayes           0.628571  0.666667   0.594595  0.628571  0.630631
2  K-Nearest Neighbors   0.666667  0.623529   0.716216  0.621429  0.615684
3  SVM                   0.766467  0.688172   0.864865  0.721429  0.712735
4  MLP                   0.726115  0.686747   0.770270  0.692857  0.688165
5  Decision Tree         0.647482  0.692308   0.608108  0.650000  0.652539
6  Extra Trees           0.760736  0.696629   0.837838  0.721429  0.714373
7  Random Forest         0.736196  0.674157   0.810811  0.692857  0.685708
8  Random Forest2        0.736196  0.674157   0.810811  0.692857  0.685708
9  XGBoost               0.710526  0.692308   0.729730  0.685714  0.683047
69
earlier_versions/practice_d1.py
Normal file
@@ -0,0 +1,69 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 21 13:06:25 2022

@author: tanu
"""
X_train
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)

x_train_scaled = scaler.transform(X_train)
x_train_scaled


foo = scaler.fit(X_train)

x_train_scaled2 = foo.transform(X_train)
x_train_scaled2

(x_train_scaled == x_train_scaled2).all()


toy = pd.DataFrame({
    'numeric': [1., 2., 3., 4., 5.],
    'category': ['a', 'a', 'b', 'c', 'b']
})
numeric_features = ['numeric']
categorical_features = ['category']
preprocessor = ColumnTransformer(transformers=[('num', StandardScaler(), numeric_features),
                                               ('cat', OneHotEncoder(), categorical_features)
                                               ])
preprocessor.fit(toy)
bar = preprocessor.transform(toy)
bar
#############
toy2 = pd.DataFrame({
    'numeric': [1., 2., 3., 4., 5.],
    'numeric2': [1., 2., 3., 4., 6.],
    'category': ['a', 'a', 'b', 'c', 'b'],
    'category2': ['b', 'a', 'b', 'e', 'f']
})
numeric_features = ['numeric', 'numeric2']
categorical_features = ['category', 'category2']
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(), categorical_features)
    ])

preprocessor.fit(toy2)
bar2 = preprocessor.transform(toy2)
bar2

####
import pandas as pd
from pandas import DataFrame
import numpy as np

from sklearn.decomposition import PCA
from pandas import DataFrame
pca = PCA(n_components = 2)
pca.fit(toy2.iloc[:, 0:2])

columns = ['pca_%i' % i for i in range(2)]
df_pca = DataFrame(pca.transform(toy2.iloc[:, 0:2])
                   , columns=columns
                   , index=toy2.index)
df_pca.head()
161
earlier_versions/skf_mm.py
Normal file
@@ -0,0 +1,161 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 10:33:15 2022

@author: tanu
"""
from statistics import mean   # assumed import for mean() used below; other names come from the earlier data-prep scripts
#%% Stratified KFold: multiple models
input_df = numerical_features_df
#X_array = np.array(input_df)
var_type = 'numerical'

input_df = all_features_df
#X_array = np.array(input_df)
var_type = 'mixed'

input_df = categorical_features_df
#X_array = np.array(input_df)
var_type = 'categorical'

targetF = target1
#==============================================================================
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix

categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix
# Determine preprocessing steps ~ var_type
if var_type == 'numerical':
    t = [('num', MinMaxScaler(), numerical_ix)]

if var_type == 'categorical':
    t = [('cat', OneHotEncoder(), categorical_ix)]

if var_type == 'mixed':
    t = [('cat', OneHotEncoder(), categorical_ix)
         , ('num', MinMaxScaler(), numerical_ix)]

###############################################################################
col_transform = ColumnTransformer(transformers = t
                                  , remainder='passthrough')
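# A hedged note (not in the original script): with the K-fold splits below, a test fold can
# contain categories that were unseen in the training fold, and OneHotEncoder then raises an
# error at transform time. handle_unknown = 'ignore' avoids this; t_safe / col_transform_safe
# are illustrative names mirroring the 'mixed' branch above.
t_safe = [('cat', OneHotEncoder(handle_unknown = 'ignore'), categorical_ix)
          , ('num', MinMaxScaler(), numerical_ix)]
col_transform_safe = ColumnTransformer(transformers = t_safe
                                       , remainder = 'passthrough')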

###############################################################################
rs = {'random_state': 42}

#log_reg = LogisticRegression(**rs)
log_reg = LogisticRegression(class_weight = 'balanced')
nb = BernoulliNB()
rf = RandomForestClassifier(**rs)

clfs = [('Logistic Regression', log_reg)
        , ('Naive Bayes', nb)
        , ('Random Forest', rf)
        ]

#seed_skf = 42
skf = StratifiedKFold(n_splits = 10
                      , shuffle = True
                      #, random_state = seed_skf
                      , **rs)
#scores_df = pd.DataFrame()
fscoreL = []
mccL = []
presL = []
recallL = []
accuL = []
roc_aucL = []

# X_array = np.array(input_df)
# Y = np.array(target1)
# Y = target1

for train_index, test_index in skf.split(input_df, targetF):
    print('\nSKF train index:', train_index
          , '\nSKF test index:', test_index)
    x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
    y_train_fold, y_test_fold = targetF.iloc[train_index], targetF.iloc[test_index]
    # for train_index, test_index in skf.split(X_array, Y):
    #     print('\nSKF train index:', train_index
    #           , '\nSKF test index:', test_index)
    #     x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
    #     y_train_fold, y_test_fold = Y[train_index], Y[test_index]

    clf_scores_df = pd.DataFrame()
    for clf_name, clf in clfs:
        # model2 = Pipeline(steps=[('preprocess', MinMaxScaler())
        #                          , ('classifier', clf)])
        model2 = Pipeline(steps=[('preprocess', col_transform)
                                 , ('classifier', clf)])

        model2.fit(x_train_fold, y_train_fold)
        y_pred_fold = model2.predict(x_test_fold)

        #----------------
        # Model metrics
        #----------------
        # F1-Score
        fscore = f1_score(y_test_fold, y_pred_fold)
        fscoreL.append(fscore)
        # print('fscoreL Len: ', len(fscoreL))
        fscoreM = mean(fscoreL)

        # Matthews correlation coefficient
        mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
        mccL.append(mcc)
        mccM = mean(mccL)

        # Precision
        pres = precision_score(y_test_fold, y_pred_fold)
        presL.append(pres)
        presM = mean(presL)

        # Recall
        recall = recall_score(y_test_fold, y_pred_fold)
        recallL.append(recall)
        recallM = mean(recallL)

        # Accuracy
        accu = accuracy_score(y_test_fold, y_pred_fold)
        accuL.append(accu)
        accuM = mean(accuL)

        # ROC_AUC
        roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
        roc_aucL.append(roc_auc)
        roc_aucM = mean(roc_aucL)

        # NOTE: fscoreL etc. are shared across classifiers and folds, so the values appended
        # here are running means pooled over all models -- likely why the per-model numbers
        # come out 'awfully close' (see the CHECK comments below).
        clf_scores_df = clf_scores_df.append({'Model': clf_name
                                              , 'F1_score' : fscoreM
                                              , 'MCC'      : mccM
                                              , 'Precision': presM
                                              , 'Recall'   : recallM
                                              , 'Accuracy' : accuM
                                              , 'ROC_curve': roc_aucM}
                                             , ignore_index = True)
        #scores_df = scores_df.append(clf_scores_df)


#%% Call functions

tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN)
tN_res

t2_res = MultClassPipeline2(X_train, X_test, y_train, y_test, input_df = all_features_df)
t2_res

#CHECK: numbers are awfully close to each other!

t3_res = MultClassPipeSKF(input_df = numerical_features_df
                          , y_targetF = target1
                          , var_type = 'numerical'
                          , skf_splits = 10)
t3_res

#CHECK: numbers are awfully close to each other!
t4_res = MultClassPipeSKF(input_df = all_features_df
                          , y_targetF = target1
                          , var_type = 'mixed'
                          , skf_splits = 10)
t4_res
39
earlier_versions/testing_lazypredict_p1.py
Normal file
@@ -0,0 +1,39 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 14 10:46:44 2022

@author: tanu
"""
# Link: https://laptrinhx.com/how-to-run-30-machine-learning-models-with-2-lines-of-code-1521663246/
import pyforest
import warnings
warnings.filterwarnings("ignore")
from sklearn import metrics
from sklearn.metrics import accuracy_score
import lazypredict
from lazypredict.Supervised import LazyClassifier

#%%
target = target1
#target = target3
X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df,
                                                        target,
                                                        test_size = 0.33,
                                                        random_state = 42)


#%%
clf = LazyClassifier(verbose=0, ignore_warnings=True)
modelsN, predictionsN = clf.fit(X_trainN, X_testN, y_trainN, y_testN)
mm_lpN = modelsN

#%%
# DOESN'T work as-is: the categorical features need to be encoded first (see the one-hot
# encoding sketch at the end of this file)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
mm_lp = models

# NOTE: a Pipeline cannot wrap LazyClassifier like this; Pipeline.fit() does not accept
# (X_train, X_test, y_train, y_test), so the two lines below fail as written.
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
                           , ('multiModels', clf) ])

models, predictions = model1.fit(X_trainN, X_testN, y_trainN, y_testN)
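# A hedged sketch (not in the original script) of the encoding step flagged above: one-hot
# encode the categorical columns with pd.get_dummies before splitting, then hand the purely
# numeric frame to LazyClassifier. Uses all_features_df, categorical_features_names and
# target from the earlier scripts.
X_enc = pd.get_dummies(all_features_df, columns = categorical_features_names)
X_trainE, X_testE, y_trainE, y_testE = train_test_split(X_enc,
                                                        target,
                                                        test_size = 0.33,
                                                        random_state = 42)
clf2 = LazyClassifier(verbose=0, ignore_warnings=True)
models_enc, predictions_enc = clf2.fit(X_trainE, X_testE, y_trainE, y_testE)
models_enc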