renamed hyperparams to gscv

This commit is contained in:
Tanushree Tunstall 2022-03-22 11:08:20 +00:00
parent a82358dbb4
commit ad5ebad7f8
31 changed files with 4433 additions and 0 deletions

Binary file not shown.


@ -0,0 +1,264 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 18 09:47:48 2022
@author: tanu
"""
#%% Useful links
# https://stackoverflow.com/questions/41844311/list-of-all-classification-algorithms
# https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
# https://github.com/davidsbatista/machine-learning-notebooks/blob/master/hyperparameter-across-models.ipynb
# https://scikit-learn.org/stable/modules/svm.html#classification
# https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/ # [params]
# https://uk.mathworks.com/help/stats/hyperparameter-optimization-in-classification-learner-app.html [ algo]
# As a rule of thumb, run baseline models on the dataset first. H2O AutoML and other AutoML packages do this automatically, but here the aim is to try it with a scikit-learn Pipeline:
# https://codereview.stackexchange.com/questions/256934/model-pipeline-to-run-multiple-classifiers-for-ml-classification
# https://uk.mathworks.com/help/stats/hyperparameter-optimization-in-classification-learner-app.html
# QDA: https://www.geeksforgeeks.org/quadratic-discriminant-analysis/
names = [
"Nearest Neighbors",
"Linear SVM",
"RBF SVM",
"Gaussian Process",
"Decision Tree",
"Random Forest",
"Neural Net",
"AdaBoost",
"Naive Bayes",
"QDA",
]
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="linear", C=0.025),
SVC(gamma=2, C=1),
GaussianProcessClassifier(1.0 * RBF(1.0)),
DecisionTreeClassifier(max_depth=5),
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
MLPClassifier(alpha=1, max_iter=1000),
AdaBoostClassifier(),
GaussianNB(),
QuadraticDiscriminantAnalysis(),
]
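#%% Baseline sweep (hedged sketch)
# The rule-of-thumb note above says to run baseline models first; this helper
# loops the names/classifiers lists through a common scaled pipeline. X and y
# (and the estimator imports for the list above) are assumed to exist in the session.
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

def run_baselines(X, y, cv = 5):
    for name, clf in zip(names, classifiers):
        pipe = Pipeline([('pre', MinMaxScaler()), ('clf', clf)])
        scores = cross_val_score(pipe, X, y, cv = cv, scoring = 'accuracy')
        print(f'{name}: mean CV accuracy = {scores.mean():.3f}')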
# NOTE Logistic regression
# The choice of the algorithm depends on the penalty chosen: Supported penalties by solver:
# newton-cg - [l2, none]
# lbfgs - [l2, none]
# liblinear - [l1, l2]
# sag - [l2, none]
# saga - [elasticnet, l1, l2, none]
# SVR?
# estimator=SVR(kernel='rbf')
# param_grid={
# 'C': [1.1, 5.4, 170, 1001],
# 'epsilon': [0.0003, 0.007, 0.0109, 0.019, 0.14, 0.05, 8, 0.2, 3, 2, 7],
# 'gamma': [0.7001, 0.008, 0.001, 3.1, 1, 1.3, 5]
# }
#%% Classification algorithms param grid
# NOTE: the pipelines below assume two dicts defined elsewhere in the session
# (see the companion scripts in this commit), e.g.:
# rs = {'random_state': 42}
# njobs = {'n_jobs': -1} # assumed; only rs is shown in the other scripts
#%% LogisticRegression()
#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
gs_lr = Pipeline((
('pre' , MinMaxScaler())
,('clf', LogisticRegression(**rs
, **njobs))
))
gs_lr_params = {
'clf__C' : [0.0001, 0.001, 0.01, 0.1 ,1, 10, 100]
#'C': np.logspace(-4, 4, 50)
, 'clf__penalty': ['l1', 'l2', 'elasticnet', 'none']
, 'clf__solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}
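# Hedged sketch tying the NOTE on solver/penalty support above to this grid:
# a LIST of param-grid dicts keeps GridSearchCV from trying unsupported
# solver/penalty combos (e.g. newton-cg with l1). C values mirror gs_lr_params.
gs_lr_params_compat = [
    {'clf__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
     , 'clf__penalty': ['l2', 'none']
     , 'clf__solver': ['newton-cg', 'lbfgs', 'sag']}
    , {'clf__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
       , 'clf__penalty': ['l1', 'l2']
       , 'clf__solver': ['liblinear']}
    , {'clf__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
       , 'clf__penalty': ['l1', 'l2', 'none']
       , 'clf__solver': ['saga']}
    , {'clf__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
       , 'clf__penalty': ['elasticnet']
       , 'clf__solver': ['saga']
       , 'clf__l1_ratio': [0.25, 0.5, 0.75]} # elasticnet additionally needs l1_ratio
]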
#%% DecisionTreeClassifier()
gs_dt = Pipeline((
    ('pre' , MinMaxScaler())
    , ('clf', DecisionTreeClassifier(**rs)) # no n_jobs param on DecisionTreeClassifier
))
gs_dt_params = {
'clf__max_depth': [ 2, 4, 6, 8, 10]
, 'clf__criterion':['gini','entropy']
, "clf__max_features":["auto", None]
, "clf__max_leaf_nodes":[10,20,30,40]
}
#%% KNeighborsClassifier()
#https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
gs_knn = Pipeline((
    ('pre' , MinMaxScaler())
    ,('clf', KNeighborsClassifier(**njobs)) # KNN is deterministic: no random_state param
))
gs_knn_params = {
'clf__n_neighbors': [3, 7, 10]
#, 'clf__n_neighbors': range(1, 21, 2)
,'clf__metric' : ['euclidean', 'manhattan', 'minkowski']
, 'clf__weights' : ['uniform', 'distance']
}
#%% RandomForestClassifier()
gs_rf = Pipeline((
('pre' , MinMaxScaler())
,('clf', RandomForestClassifier(**rs
, **njobs
, bootstrap = True
, oob_score = True))
))
gs_rf_params = {
'clf__max_depth': [4, 6, 8, 10, 12, 16, 20, None]
, 'clf__class_weight':['balanced','balanced_subsample']
, 'clf__n_estimators': [10, 100, 1000]
, 'clf__criterion': ['gini', 'entropy']
, 'clf__max_features': ['auto', 'sqrt']
, 'clf__min_samples_leaf': [2, 4, 8, 50]
, 'clf__min_samples_split': [10, 20]
}
#%% XGBClassifier()
# https://stackoverflow.com/questions/34674797/xgboost-xgbclassifier-defaults-in-python
# https://stackoverflow.com/questions/34674797/xgboost-xgbclassifier-defaults-in-python
gs_xgb = Pipeline((
('pre' , MinMaxScaler())
,('clf', XGBClassifier(**rs
, **njobs))
))
gs_xgb_params = {
    'clf__learning_rate': [0.01, 0.05, 0.1, 0.2]
    , 'clf__max_depth': [4, 6, 8, 10, 12, 16, 20]
    # min_samples_leaf/max_features are sklearn tree params, not XGBoost ones;
    # the closest XGBoost knobs (values here are illustrative) are:
    , 'clf__min_child_weight': [1, 5, 10]
    , 'clf__colsample_bytree': [0.5, 0.7, 1.0]
}
#%% MLPClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
gs_mlp = Pipeline((
    ('pre' , MinMaxScaler())
    ,('clf', MLPClassifier(**rs
                           , max_iter = 500)) # no n_jobs param on MLPClassifier
))
gs_mlp_params = {
    'clf__hidden_layer_sizes': [(1,), (2,), (3,)]
    # max_features/min_samples_* are tree params, not MLP ones; the usual MLP
    # knobs (values here are illustrative) are alpha and solver instead:
    , 'clf__alpha': [0.0001, 0.001, 0.01]
    , 'clf__solver': ['lbfgs', 'sgd', 'adam']
}
#%% RidgeClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifier.html
gs_rc = Pipeline((
    ('pre' , MinMaxScaler()) # CHECK if it wants -1 to 1
    ,('clf', RidgeClassifier(**rs)) # no n_jobs param on RidgeClassifier
))
gs_rc_params = {
'clf__alpha': [0.1, 0.2, 0.5, 0.8, 1.0]
}
#%% SVC()
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
gs_svc = Pipeline((
    ('pre' , MinMaxScaler()) # CHECK if it wants -1 to 1
    ,('clf', SVC(**rs)) # no n_jobs param on SVC
))
gs_svc_params = {
    'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'] # 'precomputed' needs a kernel matrix, not raw X
    , 'clf__C' : [50, 10, 1.0, 0.1, 0.01]
    , 'clf__gamma': ['scale', 'auto'] }
#%% BaggingClassifier()
#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
gs_bdt = Pipeline((
('pre' , MinMaxScaler()) # CHECK if it wants -1 to 1
,('clf', BaggingClassifier(**rs
, **njobs
, bootstrap = True
, oob_score = True))
))
gs_bdt_params = {
    'clf__n_estimators' : [10, 100, 1000]
    # base_estimator takes estimator objects, not strings; if None, a DecisionTreeClassifier is used
    , 'clf__base_estimator' : [None, SVC(), KNeighborsClassifier()]
    # (gamma is an SVC param, so it does not belong in this grid)
}
#%% GradientBoostingClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
gs_gb = Pipeline((
('pre' , MinMaxScaler()) # CHECK if it wants -1 to 1
,('clf', GradientBoostingClassifier(**rs))
))
gs_gb_params = { # distinct name so it does not clobber the Bagging grid
    'clf__n_estimators' : [10, 100, 1000]
, 'clf__learning_rate': [0.001, 0.01, 0.1]
, 'clf__subsample' : [0.5, 0.7, 1.0]
, 'clf__max_depth' : [3, 7, 9]
}
#%% AdaBoostClassifier()
#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#sklearn.ensemble.AdaBoostClassifier
gs_ada = Pipeline(( # distinct name: gs_gb above is the GradientBoosting pipeline
    ('pre' , MinMaxScaler()) # CHECK if it wants -1 to 1
    ,('clf', AdaBoostClassifier(**rs))
))
gs_ada_params = {
    'clf__n_estimators': [1, 2] # must be positive ints
    # base_estimator takes estimator objects; None falls back to DecisionTreeClassifier(max_depth=1)
    # NOTE: the default SAMME.R algorithm needs predict_proba (SVC would need
    # probability=True) and KNN does not support sample_weight, so these two
    # entries are illustrative rather than guaranteed to fit
    , 'clf__base_estimator' : [None, SVC(), KNeighborsClassifier()]
    #, 'clf___splitter' : ["best", "random"]
}
#%% GaussianProcessClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessClassifier.html
#GaussianProcessClassifier(1.0 * RBF(1.0)),
gs_gpc = Pipeline((
('pre' , MinMaxScaler()) # CHECK if it wants -1 to 1
,('clf', GaussianProcessClassifier(**rs))
))
gs_gpc_params = {
    # needs: from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel
    'clf__kernel': [1*RBF(), 1*DotProduct(), 1*Matern(), 1*RationalQuadratic(), 1*WhiteKernel()]
}
#%% GaussianNB()
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
gs_gnb = Pipeline((
    ('pre' , MinMaxScaler())
    , ('pca', PCA() ) # CHECK if it wants -1 to 1
    ,('clf', GaussianNB()) # no random_state param on GaussianNB
))
gs_gnb_params = {
'clf__priors': [None]
, 'clf__var_smoothing': np.logspace(0,-9, num=100)
}
#%% QuadraticDiscriminantAnalysis()
#https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html
gs_qda = Pipeline((
('pre' , MinMaxScaler())
#, ('pca', PCA() )# CHECK if it wants -1 to 1
,('clf', QuadraticDiscriminantAnalysis())
))
#%% BernoulliNB()
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html
gs_bnb = Pipeline(( # distinct name: gs_gnb above is the GaussianNB pipeline
    ('pre' , MinMaxScaler())
    ,('clf', BernoulliNB())
))
# defaults: BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
gs_bnb_params = {
    'clf__alpha': [0, 1]
    , 'clf__binarize': [None, 0] # None (not the string 'None') means data is already binary
    , 'clf__fit_prior': [True]
    , 'clf__class_prior': [None]
}
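#%% Driving the grids above (hedged sketch)
# Nothing in this file actually fits anything; a minimal driver could look like
# this, assuming X_train/y_train exist and njobs is defined as above.
from sklearn.model_selection import GridSearchCV, StratifiedKFold

def run_gscv(pipe, params, X_train, y_train, scoring = 'f1'):
    skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)
    gscv = GridSearchCV(pipe, params, cv = skf, scoring = scoring, **njobs)
    gscv.fit(X_train, y_train)
    print('Best model:\n', gscv.best_params_)
    print('Best models score:\n', gscv.best_score_)
    return gscv

# example: run_gscv(gs_lr, gs_lr_params, X_train, y_train)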

earlier_versions/GSCV_base (new file, 128 lines)

@ -0,0 +1,128 @@
# Logistic regression:
pnca
input: numerical features
output: dm/om: target
grid search/base estimator with a single model and hyperparameter choices: gives you the best model based on a SINGLE metric!
-- question: which metric should be optimised for?
base estimator with multiple models and multiple hyperparameters: returns the OVERALL best model-hyperparameter combo, based on a single score?
-- question: which metric should be optimised for?
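One way to handle the metric question is multi-metric grid search: pass several
scorers and pick one to refit on. A hedged sketch, assuming pipe/params/X/y
exist in the session:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, matthews_corrcoef

scoring = {'accuracy': 'accuracy', 'f1': 'f1', 'recall': 'recall'
           , 'roc_auc': 'roc_auc', 'mcc': make_scorer(matthews_corrcoef)}
gscv = GridSearchCV(pipe, params, scoring = scoring, refit = 'mcc', cv = 10)
gscv.fit(X, y)
print('Best model:\n', gscv.best_params_)
print('Best models score:\n', gscv.best_score_)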
# Demonstration
###################
# Metric1: accuracy
###################
Best model:
{'clf__max_iter': 100, 'clf__solver': 'liblinear'}
Best models score:
0.7145320197044336
###################
# Metric2: F1
###################
Best model:
{'clf__max_iter': 100, 'clf__solver': 'saga'}
Best models score:
0.7550294183111348
###################
# Metric3: Recall
###################
Best model:
{'clf__max_iter': 100, 'clf__solver': 'saga'}
Best models score:
0.8216666666666667
###################
# Metric4: ROC_AUC
###################
Best model:
{'clf__max_iter': 200, 'clf__solver': 'sag'}
Best models score:
0.7711904761904762
###################
# Metric5: MCC
###################
Best model:
{'clf__max_iter': 100, 'clf__solver': 'saga'}
Best models score:
0.4322970173039572
sklearn/linear_model/_sag.py:354: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
ConvergenceWarning,
#####################################
# Same thing but using: CLFSwitcher()
###################
# Metric1: Accuracy
###################
Best model:
{'clf__estimator': LogisticRegression(random_state=42, solver='liblinear')
, 'clf__estimator__max_iter': 100, 'clf__estimator__solver': 'liblinear'}
Best models score:
0.7219298245614035
###################
# Metric2: F1
###################
Best model:
{'clf__estimator': LogisticRegression(random_state=42, solver='liblinear'), 'clf__estimator__max_iter': 100, 'clf__estimator__solver': 'liblinear'}
print('Best models score:\n', gscv.best_score_)
Best models score:
0.7585724070894442
###################
# Metric3: Recall
###################
Best model:
{'clf__estimator': LogisticRegression(random_state=42, solver='liblinear')
, 'clf__estimator__max_iter': 100, 'clf__estimator__solver': 'liblinear'}
Best models score:
0.8198610213316095
###################
# Metric4: ROC_AUC
###################
Best model:
{'clf__estimator': LogisticRegression(solver='newton-cg')
, 'clf__estimator__max_iter': 100, 'clf__estimator__solver': 'newton-cg'}
Best models score:
nan
###################
# Metric5: MCC
###################
Best model:
{'clf__estimator': LogisticRegression(random_state=42, solver='liblinear')
, 'clf__estimator__max_iter': 100, 'clf__estimator__solver': 'liblinear'}
Best models score:
0.4480248700902755
print('Best model:\n', gs_dt.best_params_)
Best model:
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'max_leaf_nodes': 10}
print('Best models score:\n', gs_dt.best_score_)
Best models score:
0.43290518915746007
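For reference, the CLFSwitcher used above is defined elsewhere; a common
minimal implementation of such an estimator switcher (a hedged sketch, not
necessarily the exact class used here) is:

from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression

class CLFSwitcher(BaseEstimator):
    """Estimator wrapper whose 'estimator' param GridSearchCV can swap."""
    def __init__(self, estimator = LogisticRegression()):
        self.estimator = estimator
    def fit(self, X, y = None, **kwargs):
        self.estimator.fit(X, y)
        return self
    def predict(self, X):
        return self.estimator.predict(X)
    def predict_proba(self, X):
        return self.estimator.predict_proba(X)
    def score(self, X, y):
        return self.estimator.score(X, y)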


@ -0,0 +1,109 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 4 15:25:33 2022
@author: tanu
"""
#%%
import os, sys
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
#%%
rs = {'random_state': 42}
# TODO: add preprocessing step with one hot encoder
# Multiple Classification - Model Pipeline
def MultClassPipeline(X_train, X_test, y_train, y_test):
    log_reg = LogisticRegression(**rs)
    nb = BernoulliNB()
    knn = KNeighborsClassifier()
    svm = SVC(**rs)
    mlp = MLPClassifier(max_iter=500, **rs)
    dt = DecisionTreeClassifier(**rs)
    et = ExtraTreesClassifier(**rs)
    rf = RandomForestClassifier(**rs)
    rf2 = RandomForestClassifier(
        min_samples_leaf=50,
        n_estimators=150,
        bootstrap=True,
        oob_score=True,
        n_jobs=-1,
        random_state=42,
        max_features='auto')
    xgb = XGBClassifier(**rs, verbosity=0)

    clfs = [
        ('Logistic Regression', log_reg),
        ('Naive Bayes', nb),
        ('K-Nearest Neighbors', knn),
        ('SVM', svm),
        ('MLP', mlp),
        ('Decision Tree', dt),
        ('Extra Trees', et),
        ('Random Forest', rf),
        ('Random Forest2', rf2),
        ('XGBoost', xgb)
    ]

    pipelines = []
    scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])

    for clf_name, clf in clfs:
        pipeline = Pipeline(steps=[
            ('scaler', MinMaxScaler()),
            #('scaler', StandardScaler()),
            ('classifier', clf)
        ])
        pipeline.fit(X_train, y_train)
        # Model predictions
        y_pred = pipeline.predict(X_test)
        # F1-Score
        fscore = f1_score(y_test, y_pred)
        # Precision
        pres = precision_score(y_test, y_pred)
        # Recall
        recall = recall_score(y_test, y_pred)
        # Accuracy
        accu = accuracy_score(y_test, y_pred)
        # ROC_AUC
        roc_auc = roc_auc_score(y_test, y_pred)
        # Matthews correlation coefficient
        mcc = matthews_corrcoef(y_test, y_pred)
        pipelines.append(pipeline)
        scores_df = scores_df.append({
            'Model'      : clf_name
            , 'F1_Score' : fscore
            , 'MCC'      : mcc
            , 'Precision': pres
            , 'Recall'   : recall
            , 'Accuracy' : accu
            , 'ROC_AUC'  : roc_auc
        }
        , ignore_index = True)
    return pipelines, scores_df
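# Usage (hedged sketch): assumes a train/test split made upstream with the
# train_test_split imported above.
# pipelines, scores_df = MultClassPipeline(X_train, X_test, y_train, y_test)
# print(scores_df.sort_values('MCC', ascending = False))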


@ -0,0 +1,48 @@
# Stratified K-fold vs ShuffleSplit
https://stackoverflow.com/questions/45969390/difference-between-stratifiedkfold-and-stratifiedshufflesplit-in-sklearn
In ShuffleSplit, the data is shuffled and then split on every iteration, so the test sets may overlap between splits.
In SKF, the test sets don't overlap.
So the difference is: StratifiedKFold shuffles and splits just once, so the test sets do not overlap, while StratifiedShuffleSplit re-shuffles before each of its n_splits splits, so the test sets can overlap.
Note: both methods use stratified folds (that's why "stratified" appears in both names): each part preserves the same percentage of samples of each class (label) as the original data. You can read more in the cross-validation documentation.
''' python code '''
splits = 5
tx = range(10)
ty = [0] * 5 + [1] * 5

from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold

kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
shufflesplit = StratifiedShuffleSplit(n_splits=splits, random_state=42, test_size=2)

print("KFold")
for train_index, test_index in kfold.split(tx, ty):
    print("TRAIN:", train_index, "TEST:", test_index)

print("Shuffle Split")
for train_index, test_index in shufflesplit.split(tx, ty):
    print("TRAIN:", train_index, "TEST:", test_index)
'''
Output:
KFold
TRAIN: [0 2 3 4 5 6 7 9] TEST: [1 8]
TRAIN: [0 1 2 3 5 7 8 9] TEST: [4 6]
TRAIN: [0 1 3 4 5 6 8 9] TEST: [2 7]
TRAIN: [1 2 3 4 6 7 8 9] TEST: [0 5]
TRAIN: [0 1 2 4 5 6 7 8] TEST: [3 9]
Shuffle Split
TRAIN: [8 4 1 0 6 5 7 2] TEST: [3 9]
TRAIN: [7 0 3 9 4 5 1 6] TEST: [8 2]
TRAIN: [1 2 5 6 4 8 9 0] TEST: [3 7]
TRAIN: [4 6 7 8 3 5 1 2] TEST: [9 0]
TRAIN: [7 2 6 5 4 3 0 9] TEST: [1 8]


@ -0,0 +1,29 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 10:59:36 2022
@author: tanu
"""
# numerical
#log_reg (rs)
F1_score 0.713380
MCC 0.376546
Precision 0.687628
Recall 0.747231
Accuracy 0.687293
ROC_curve 0.683199
#log_reg (balanced)
F1_score 0.715106
MCC 0.390225
Precision 0.702629
Recall 0.733445
Accuracy 0.694309
ROC_curve 0.691555
#log_reg (unbalanced)
F1_score 0.713380
MCC 0.376546
Precision 0.687628
Recall 0.747231
Accuracy 0.687293
ROC_curve 0.683199


@ -0,0 +1,229 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 6 13:41:54 2022
@author: tanu
"""
import os, sys
import pandas as pd
import numpy as np
#from copy import deepcopy
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
from statistics import mean, stdev, median, mode
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/")
# my function
from MultClassPipe import MultClassPipeline
from MultClassPipe2 import MultClassPipeline2
from loopity_loop import MultClassPipeSKF
gene = 'pncA'
drug = 'pyrazinamide'
#==============
# directories
#==============
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
#=======
# input
#=======
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
my_df = pd.read_csv(infile_ml1)
my_df.dtypes
my_df_cols = my_df.columns
geneL_basic = ['pnca']
geneL_na = ['gid']
geneL_na_ppi2 = ['rpob']
geneL_ppi2 = ['alr', 'embb', 'katg']
#%% get cols
mycols = my_df.columns
my_df['active_aa_pos'].dtype
my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
if gene.lower() in geneL_na_ppi2:
    # D1148: get rid of
    na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
    my_df = my_df.drop(index=na_index)
#%%============================================================================
# GET Y
# Target1: mutation_info_labels
dm_om_map = {'DM': 1, 'OM': 0}
target1 = my_df['mutation_info_labels'].map(dm_om_map)
target1.value_counts()
# Target2: drug
drug_labels = drug + '_labels'
drug_labels
my_df[drug_labels] = my_df[drug].map({1: 'resistant', 0: 'sensitive'})
my_df[drug_labels].value_counts()
my_df[drug_labels] = my_df[drug_labels].fillna('unknown')
my_df[drug_labels].value_counts()
target2 = my_df[drug_labels]
# Target3: drtype [Binary]
drtype_labels = 'drtype_labels'
my_df[drtype_labels] = my_df['drtype'].map({'Sensitive' : 0
, 'Other' : 0
, 'Pre-MDR' : 1
, 'MDR' : 1
, 'Pre-XDR' : 1
, 'XDR' : 1})
# target3 = 'drtype' [Multinomial]
target3 = my_df[drtype_labels]
# target4
drtype_labels2 = 'drtype_labels2'
my_df[drtype_labels2] = my_df['drtype'].map({'Sensitive' : 0
, 'Other' : 0
, 'Pre-MDR' : 1
, 'MDR' : 1
, 'Pre-XDR' : 2
, 'XDR' : 2})
target4 = my_df[drtype_labels2]
# sanity checks
target1.value_counts()
my_df['mutation_info_labels'].value_counts()
target2.value_counts()
my_df[drug_labels].value_counts()
target3.value_counts()
my_df['drtype'].value_counts()
target4.value_counts()
my_df['drtype'].value_counts()
#%%
# GET X
common_cols_stabiltyN = ['ligand_distance'
, 'ligand_affinity_change'
, 'duet_stability_change'
, 'ddg_foldx'
, 'deepddg'
, 'ddg_dynamut2']
# Build stability columns ~ gene
if gene.lower() in geneL_basic:
    x_stabilityN = common_cols_stabiltyN
if gene.lower() in geneL_ppi2:
    x_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity'
                                            , 'interface_dist']
if gene.lower() in geneL_na:
    x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity']
if gene.lower() in geneL_na_ppi2:
    x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
#D1148 get rid of
#na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
#my_df = my_df.drop(index=na_index)
X_strFN = ['asa'
, 'rsa'
, 'kd_values'
, 'rd_values']
X_evolFN = ['consurf_score'
, 'snap2_score'
, 'snap2_accuracy_pc']
# TODO: ADD ED values
# Problematic due to NA: filling NA with unknown or string will make it categorical
# OPTIONS
# 1. Imputing: KNN or MICE or from distribution
# 2. Fill na with median or mode
# 3. Separate datset without including genomic features AT ALL for ML, then using this as a 'blind test set'
# this means the size of the training data gets reduced!
# 4. Remove genomic features from ML COMPLETELEY!
# X_genomicFN = ['af'
# , 'or_mychisq'
# , 'or_logistic'
# , 'or_fisher'
# , 'pval_fisher']
#%% try combinations
X_vars1 = my_df[x_stabilityN]
X_vars2 = my_df[X_strFN]
X_vars3 = my_df[X_evolFN]
X_vars5 = my_df[x_stabilityN + X_strFN]
X_vars6 = my_df[x_stabilityN + X_evolFN]
#X_vars7 = my_df[x_stabilityN + X_genomicFN]
X_vars8 = my_df[X_strFN + X_evolFN]
#X_vars9 = my_df[X_strFN + X_genomicFN]
#X_vars10 = my_df[X_evolFN + X_genomicFN]
X_vars11 = my_df[x_stabilityN + X_strFN + X_evolFN]
#X_vars12 = my_df[x_stabilityN + X_strFN + X_evolFN + X_genomicFN]
numerical_features_names = x_stabilityN + X_strFN + X_evolFN
# separate ones for foldx?
categorical_features_names = ['ss_class'
, 'wt_prop_water'
# , 'lineage_labels' # misleading if using merged_df3
, 'mut_prop_water'
, 'wt_prop_polarity'
, 'mut_prop_polarity'
, 'wt_calcprop'
, 'mut_calcprop'
, 'active_aa_pos']
numerical_features_df = my_df[numerical_features_names]
numerical_features_df.shape
categorical_features_df = my_df[categorical_features_names]
categorical_features_df.shape
all_features_df = my_df[numerical_features_names + categorical_features_names]
all_features_df.shape
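#%% Mixed-type preprocessing (hedged sketch)
# The numerical/categorical splits above need different preprocessing per
# column type; one way to wire that up, mirroring the ColumnTransformer
# pattern used in the companion scripts (imports are already done above):
t = [('num', MinMaxScaler(), numerical_features_names)
     , ('cat', OneHotEncoder(), categorical_features_names)]
col_transform = ColumnTransformer(transformers = t
                                  , remainder = 'passthrough')
# col_transform can then be the first step of any Pipeline over all_features_df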


@ -0,0 +1,82 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 18:06:34 2022
@author: tanu
"""
#%%
models = [
('Logistic Regression' , log_reg)
, ('K-Nearest Neighbors', knn)
]
classification_metrics = {
'F1_score': []
,'MCC': []
,'Precision': []
,'Recall': []
,'Accuracy': []
,'ROC_curve': []
}
folds=[1,2]
fold_no=1
fold_dict={}
import random # needed for the dummy F1 scores below
import pprint; pp = pprint.PrettyPrinter(indent = 4)

for model_name, model in models:
    fold_dict.update({model_name: {}})

for f in folds:
    fold = ("fold_" + str(fold_no))
    for model_name, model in models:
        print("start of model", model_name, "fold: ", fold)
        fold_dict[model_name].update({fold: {}})
        fold_dict[model_name][fold].update(classification_metrics)
        print("end of model", model_name, "fold: ", fold)
        fold_dict[model_name][fold].update({'F1_score': random.randrange(1, 10)})
    fold_no += 1
pp.pprint(fold_dict)
#%%
folds_f1 = []
for model_name, model in models:
    print("Calculating mean for F1_score for: ", model_name)
    #for key in fold_dict['Logistic Regression']:
    # wrap this in a classification_metric for loop
    for key in fold_dict[model_name]:
        # use model_name here; hard-coding 'Logistic Regression' would read
        # the same model's scores on every pass
        folds_f1.append(fold_dict[model_name][key]['F1_score'])
        #folds_f1.append(folds_f1)
        print('key:', key, 'F1scores:', folds_f1)
mean(folds_f1)
#%%
scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
# manually
model_name = 'Logistic Regression'
model_metric = 'F1_score'
log_reg_f1 = []
for key in fold_dict[model_name]:
    log_reg_f1.append(fold_dict[model_name][key][model_metric])
log_reg_f1M = mean(log_reg_f1)
print('key:', key, model_metric, ':', log_reg_f1)
print(log_reg_f1M)
log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
log_reg_f1df
#%%
model_metric = 'MCC'
log_reg_mcc = []
for key in fold_dict[model_name]:
    log_reg_mcc.append(fold_dict[model_name][key][model_metric])
log_reg_mccM = mean(log_reg_mcc)
print('key:', key, model_metric, ':', log_reg_mcc)
print(log_reg_mccM)
log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
log_reg_mccdf
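#%% Generalised aggregation (hedged sketch)
# The manual per-metric blocks above, wrapped in the "classification_metric
# for loop" hinted at earlier: one pass over all models and metrics.
from statistics import mean
import pandas as pd

mean_scores = {}
for model_name, model in models:
    mean_scores[model_name] = {}
    for metric in classification_metrics:
        vals = [fold_dict[model_name][fold][metric]
                for fold in fold_dict[model_name]]
        # only F1_score holds real numbers in this demo; the other metrics
        # still carry the empty-list placeholder, so guard before averaging
        mean_scores[model_name][metric] = (mean(vals)
                                           if all(isinstance(v, (int, float)) for v in vals)
                                           else None)
print(pd.DataFrame(mean_scores))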


@ -0,0 +1,84 @@
# stabilty [6]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.738854 0.698795 0.783784 0.707143 0.702498
1 Naive Bayes 0.627451 0.607595 0.648649 0.592857 0.589476
2 K-Nearest Neighbors 0.731707 0.666667 0.810811 0.685714 0.678133
3 SVM 0.729412 0.645833 0.837838 0.671429 0.661343
4 MLP 0.670968 0.641975 0.702703 0.635714 0.631654
5 Decision Tree 0.653595 0.632911 0.675676 0.621429 0.618141
6 Extra Trees 0.733728 0.652632 0.837838 0.678571 0.668919
7 Random Forest 0.726190 0.648936 0.824324 0.671429 0.662162
8 XGBoost 0.704403 0.658824 0.756757 0.664286 0.658681
# evolution [3]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.795181 0.717391 0.891892 0.757143 0.748976
1 Naive Bayes 0.805031 0.752941 0.864865 0.778571 0.773342
2 K-Nearest Neighbors 0.735484 0.703704 0.770270 0.707143 0.703317
3 SVM 0.797619 0.712766 0.905405 0.757143 0.748157
4 MLP 0.787879 0.714286 0.878378 0.750000 0.742219
5 Decision Tree 0.631579 0.615385 0.648649 0.600000 0.597052
6 Extra Trees 0.688312 0.662500 0.716216 0.657143 0.653563
7 Random Forest 0.704403 0.658824 0.756757 0.664286 0.658681
8 XGBoost 0.713376 0.674699 0.756757 0.678571 0.673833
# str features [4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.729412 0.645833 0.837838 0.671429 0.661343
1 Naive Bayes 0.723926 0.662921 0.797297 0.678571 0.671376
2 K-Nearest Neighbors 0.662338 0.637500 0.689189 0.628571 0.624898
3 SVM 0.727273 0.627451 0.864865 0.657143 0.644554
4 MLP 0.710843 0.641304 0.797297 0.657143 0.648649
5 Decision Tree 0.561151 0.600000 0.527027 0.564286 0.566544
6 Extra Trees 0.567376 0.597015 0.540541 0.564286 0.565725
7 Random Forest 0.596026 0.584416 0.608108 0.564286 0.561630
8 XGBoost 0.630872 0.626667 0.635135 0.607143 0.605446
#=========================================================================
# stability + evolution + str features [13 = 6+3+4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.726115 0.686747 0.770270 0.692857 0.688165
1 Naive Bayes 0.730769 0.695122 0.770270 0.700000 0.695741
2 K-Nearest Neighbors 0.742515 0.666667 0.837838 0.692857 0.684070
3 SVM 0.763636 0.692308 0.851351 0.721429 0.713554
4 MLP 0.717949 0.682927 0.756757 0.685714 0.681409
5 Decision Tree 0.671429 0.712121 0.635135 0.671429 0.673628
6 Extra Trees 0.756410 0.719512 0.797297 0.728571 0.724406
7 Random Forest 0.742138 0.694118 0.797297 0.707143 0.701679
8 XGBoost 0.692810 0.670886 0.716216 0.664286 0.661138
# stability + evolution [9=6+3]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.729560 0.682353 0.783784 0.692857 0.687346
1 Naive Bayes 0.743590 0.707317 0.783784 0.714286 0.710074
2 K-Nearest Neighbors 0.720497 0.666667 0.783784 0.678571 0.672195
3 SVM 0.771084 0.695652 0.864865 0.728571 0.720311
4 MLP 0.679739 0.658228 0.702703 0.650000 0.646806
5 Decision Tree 0.620690 0.633803 0.608108 0.607143 0.607084
6 Extra Trees 0.727273 0.700000 0.756757 0.700000 0.696560
7 Random Forest 0.734177 0.690476 0.783784 0.700000 0.694922
8 XGBoost 0.675497 0.662338 0.689189 0.650000 0.647625
# stability + str features [10=6+4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.750000 0.697674 0.810811 0.714286 0.708436
1 Naive Bayes 0.714286 0.687500 0.743243 0.685714 0.682228
2 K-Nearest Neighbors 0.687500 0.639535 0.743243 0.642857 0.636773
3 SVM 0.743902 0.677778 0.824324 0.700000 0.692465
4 MLP 0.716981 0.670588 0.770270 0.678571 0.673014
5 Decision Tree 0.616438 0.625000 0.608108 0.600000 0.599509
6 Extra Trees 0.697368 0.679487 0.716216 0.671429 0.668714
7 Random Forest 0.684211 0.666667 0.702703 0.657143 0.654382
8 XGBoost 0.666667 0.645570 0.689189 0.635714 0.632473
# evolution + str features[7=3+4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.773006 0.707865 0.851351 0.735714 0.728706
1 Naive Bayes 0.750000 0.730769 0.770270 0.728571 0.726044
2 K-Nearest Neighbors 0.737500 0.686047 0.797297 0.700000 0.694103
3 SVM 0.763636 0.692308 0.851351 0.721429 0.713554
4 MLP 0.775758 0.703297 0.864865 0.735714 0.727887
5 Decision Tree 0.675497 0.662338 0.689189 0.650000 0.647625
6 Extra Trees 0.715232 0.701299 0.729730 0.692857 0.690622
7 Random Forest 0.715232 0.701299 0.729730 0.692857 0.690622
8 XGBoost 0.721519 0.678571 0.770270 0.685714 0.680590


@ -0,0 +1,156 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 3 17:08:18 2022
@author: tanu
"""
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
import pandas as pd
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
# this needs to be merged_df2 or merged_df3?
#gene 'pncA'
drug = 'pyrazinamide'
my_df = pd.read_csv("pnca_merged_df3.csv")
my_df.dtypes
my_df_cols = my_df.columns
#%%============================================================================
# GET Y
# Y = my_df.loc[:,drug] #has NA
dm_om_map = {'DM': 1, 'OM': 0}
my_df['resistance'] = my_df['mutation_info_labels'].map(dm_om_map)
# sanity check
my_df['resistance'].value_counts()
my_df['mutation_info_labels'].value_counts()
Y = my_df['resistance']
# GET X
cols = my_df.columns
X_stability = my_df[['ligand_distance'
, 'ligand_affinity_change'
, 'duet_stability_change'
, 'ddg_foldx'
, 'deepddg'
, 'ddg_dynamut2']]
X_evol = my_df[['consurf_score'
, 'snap2_score'
, 'snap2_accuracy_pc']]
X_str = my_df[['asa'
, 'rsa'
, 'kd_values'
, 'rd_values']]
#%% try combinations
X_vars = X_stability
X_vars = X_evol
X_vars = X_str
X_vars = pd.concat([X_stability, X_evol, X_str], axis = 1)
X_vars = pd.concat([X_stability, X_evol], axis = 1)
X_vars = pd.concat([X_stability, X_str], axis = 1)
X_vars = pd.concat([X_evol, X_str], axis = 1)
#%%
X_vars.shape[1]
# TODO: stratified cross validate
# Train-test Split
rs = {'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X_vars,
Y,
test_size = 0.33,
random_state = 42)
# Classification - Model Pipeline
def modelPipeline(X_train, X_test, y_train, y_test):
    log_reg = LogisticRegression(**rs)
    nb = BernoulliNB()
    knn = KNeighborsClassifier()
    svm = SVC(**rs)
    mlp = MLPClassifier(max_iter=500, **rs)
    dt = DecisionTreeClassifier(**rs)
    et = ExtraTreesClassifier(**rs)
    rf = RandomForestClassifier(**rs)
    xgb = XGBClassifier(**rs, verbosity=0)

    clfs = [
        ('Logistic Regression', log_reg),
        ('Naive Bayes', nb),
        ('K-Nearest Neighbors', knn),
        ('SVM', svm),
        ('MLP', mlp),
        ('Decision Tree', dt),
        ('Extra Trees', et),
        ('Random Forest', rf),
        ('XGBoost', xgb)
    ]

    pipelines = []
    scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])

    for clf_name, clf in clfs:
        pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('classifier', clf)
        ])
        pipeline.fit(X_train, y_train)
        # Model predictions
        y_pred = pipeline.predict(X_test)
        # F1-Score
        fscore = f1_score(y_test, y_pred)
        # Precision
        pres = precision_score(y_test, y_pred)
        # Recall
        rcall = recall_score(y_test, y_pred)
        # Accuracy
        accu = accuracy_score(y_test, y_pred)
        # ROC_AUC
        roc_auc = roc_auc_score(y_test, y_pred)
        pipelines.append(pipeline)
        scores_df = scores_df.append({
            'Model' : clf_name,
            'F1_Score' : fscore,
            'Precision' : pres,
            'Recall' : rcall,
            'Accuracy' : accu,
            'ROC_AUC' : roc_auc
        },
        ignore_index = True)
    return pipelines, scores_df
modelPipeline(X_train, X_test, y_train, y_test)


@ -0,0 +1,81 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 3 17:08:18 2022
@author: tanu
"""
#%% load packages
import sys, os
import pandas as pd
from pandas import DataFrame
import numpy as np
import argparse
from functools import reduce
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
#gene = '' # set in the session before running, e.g. gene = 'pncA'
#drug = '' # e.g. drug = 'pyrazinamide'
#==============
# directories
#==============
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
# gene_baiscL = ['pnca']
# geneL_naL = ['gid', 'rpob']
# geneL_ppi2L = ['alr', 'embb', 'katg', 'rpob']
#=======
# input
#=======
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
my_df = pd.read_csv(infile_ml1)
my_df.dtypes
my_df_cols = my_df.columns
#%%============================================================================
# GET Y
drug_labels = drug + '_labels'
drug_labels
my_df[drug_labels] = my_df[drug]
my_df[drug_labels].value_counts()
my_df[drug_labels] = my_df[drug].map({1: 'resistant', 0: 'sensitive'})
my_df[drug_labels].value_counts()
my_df[drug_labels] = my_df[drug_labels].fillna('unknown')
my_df[drug_labels].value_counts()
mutC = my_df[[ 'mutationinformation']].count()
target1C = my_df['mutation_info_labels'].value_counts()
target2C = my_df[drug_labels].value_counts()
#target2C.index = target2C.index.to_series().map({1: 'resistant', 0: 'sensitive'})
target3C = my_df['drtype'].value_counts()
targetsC = pd.concat([mutC, target1C, target2C, target3C])
targetsC
# targetsC2 = pd.concat([mutC, target1C, target2C
# #, target3C
# ], axis = 1)
# targetsC2
#%% try combinations
# X_vars = X_stability
# X_vars = X_evol
# X_vars = X_str
# X_vars = pd.concat([X_stability, X_evol, X_str], axis = 1)
# X_vars = pd.concat([X_stability, X_evol], axis = 1)
# X_vars = pd.concat([X_stability, X_str], axis = 1)
# X_vars = pd.concat([X_evol, X_str], axis = 1)


@ -0,0 +1,212 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 24 10:48:10 2022
@author: tanu
"""
###############################################################################
# questions:
# which data to use: merged_df3 or merged_df2
# which is the target? or_mychisq or drtype col
# scaling: can it be from -1 to 1?
# strategy:
# available data = X_train
# available data but NAN = validation_test
# test data: mut generated not in mcsm
###############################################################################
import os, sys
import re
from sklearn.datasets import load_boston
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
#%% read data
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
# this needs to be merged_df2 or merged_df3?
my_df = pd.read_csv("pnca_all_params.csv")
my_df.dtypes
my_df_cols = my_df.columns
omit_cols1 = ['pdb_file'
, 'seq_offset4pdb'
, 'mut_3upper'
, 'wild_pos'
, 'wild_chain_pos'
, 'chain'
, 'wt_3upper'
, 'consurf_colour'
, 'consurf_colour_rev'
, 'consurf_msa_data'
, 'consurf_aa_variety'
, 'snap2_accuracy_pc'
, 'beta_logistic'
, 'se_logistic'
, 'zval_logisitc'
, 'pval_chisq'
, 'log10_or_mychisq'
, 'neglog_pval_fisher'
, 'wild_type'
, 'mutant_type'
, 'position'
, 'ligand_id'
, 'mutation'
, 'ss'
, 'ss_class' # include it later?
]
omit_cols2 = list(my_df.columns[my_df.columns.str.contains(".*ci_.*") | my_df.columns.str.contains(".*_scaled*") | my_df.columns.str.contains(".*_outcome*")])
# [WATCH:] just to test since these have negative values!
omit_cols3 = list(my_df.columns[my_df.columns.str.contains("electro_.*") | my_df.columns.str.contains("disulfide_.*") | my_df.columns.str.contains("hbonds_.*") | my_df.columns.str.contains("partcov_.*") | my_df.columns.str.contains("vdwclashes.*") | my_df.columns.str.contains("volumetric.*")])
omit_cols = omit_cols1 + omit_cols2 + omit_cols3
my_df_filt = my_df.loc[:, ~my_df.columns.isin(omit_cols)]
my_df_filt_cols = my_df_filt.columns
foo = my_df_filt['or_mychisq'].value_counts()
foo = foo.to_frame()
########################
# [WATCH]: Drop na
my_df2 = my_df_filt.dropna()
my_df2['resistance'] = my_df2['or_mychisq'].apply(lambda x: 0 if x <=1 else 1)
my_df2['resistance'].value_counts()
y = my_df2['resistance']
#==============================================================================
omit_cols_y = ['or_mychisq', 'resistance']
my_df_ml = my_df2.loc[:, ~my_df2.columns.isin(omit_cols_y)]
#%%############################################################################
X_train = my_df_ml.set_index('mutationinformation')
X_train = X_train.iloc[:,:4]
y_train = y
#X_train = X_train.dropna()
#y_train = y.dropna()
# check dim
X_train.shape
y_train.shape
#%%=====================================================
grid = sns.PairGrid(data = pd.concat([X_train
, pd.Series(y_train , name = "resistance")]
, axis = 1))
grid.map_offdiag(sns.scatterplot)
grid.map_diag(sns.distplot)
plt.show()
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
###################
# test set
X_test = my_df[my_df['or_mychisq'].isnull()]
#X_test =[ X_test.iloc[:,:4]]
# HARD part?
# what should be the test set?
X_test = [23.9, 0.69, -0.16, 0.59
, 5, 0.5, 0.4, -1
, 0.1, 1, 1, 1]
X_test_re = np.array(X_test).reshape(3, -1)
####################
fitted = model.predict(X_train)
model.coef_
model.predict(X_test_re)
resid = y_train - fitted
resid
#####################
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
#We can then create a scaled training set
X_train_scaled = scaler.transform(X_train)
new_scaled = scaler.transform(X_test_re)
model.predict(new_scaled)
#########
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
# model_pipe = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
# ,('regression', linear_model.LinearRegression())
# ])
model_pipe = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
,('logis', LogisticRegression(class_weight = 'balanced'))
])
model_pipe.fit(X_train,y_train)
fitted_vals = model_pipe.predict(X_train)
# gives the array of predictions
model_pipe.predict(X_test_re)
# for Linear Reg only
# resid = y_train - fitted_vals
# resid
#=====
# Logistic 1 test
# FAILS because the test set must have the same dimensionality as the input:
# i.e. if you give the model 10 features to train on, you need to supply
# 10 features per sample to predict anything.
# THINK!!!!
#=====
mod_logis = linear_model.LogisticRegression(class_weight = 'balanced')
mod_logis.fit(X_train,y_train)
X_test = [23.9]
X_test_re = np.array(X_test).reshape(1, -1)
mod_logis.predict(X_test_re)
#################
from sklearn.metrics import accuracy_score, precision_score, recall_score
y_pred = model_pipe.predict(X_train)
accuracy_score(y_train,y_pred)
precision_score(y_train,y_pred,pos_label=1)# tp/(tp + fp)
recall_score(y_train,y_pred,pos_label=1) # tp/(tp + fn)
########
# WORKS!
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
import pandas as pd
acc = make_scorer(accuracy_score)
def precision(y_true, y_pred):
    return precision_score(y_true, y_pred, pos_label = 1) #0
def recall(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label = 1) #0
prec = make_scorer(precision)
rec = make_scorer(recall)
output = cross_validate(model_pipe
, X_train
, y_train
, scoring = {'acc' : acc
,'prec' : prec
,'rec' : rec}
, cv = 10, return_train_score = False)
pd.DataFrame(output).mean()
# fit_time 0.005486
# score_time 0.002673
# test_acc 0.601799
# test_prec 0.976936
# test_rec 0.603226
# dtype: float64
# the three scores
# 0.65527950310559
# 0.9853658536585366
# 0.6516129032258065
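# Aside (hedged sketch): the precision/recall wrapper functions above are not
# strictly needed; make_scorer forwards keyword args straight to the metric:
prec2 = make_scorer(precision_score, pos_label = 1)
rec2 = make_scorer(recall_score, pos_label = 1)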


@ -0,0 +1,272 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 5 12:57:32 2022
@author: tanu
"""
#%%
# Data, etc for now comes from my_data6.py and/or my_data5.py
#%% Specify dir and import functions
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/")
#%% Try combinations
#import sys, os
#os.system("imports.py")
def precision(y_true, y_pred):
    return precision_score(y_true, y_pred, pos_label = 1)
def recall(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label = 1)
def f1(y_true, y_pred):
    return f1_score(y_true, y_pred, pos_label = 1)
#%% Check df features
numerical_features_df.shape
categorical_features_df.shape
all_features_df.shape
all_features_df.dtypes
#%% Simple train and test data splits
target = target1
#target = target3
X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df,
target,
test_size = 0.33,
random_state = 42)
X_trainC, X_testC, y_trainC, y_testC = train_test_split(categorical_features_df,
target,
test_size = 0.33,
random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(all_features_df,
target,
test_size = 0.33,
random_state = 42)
#%% Stratified K-fold: Single model
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('log_reg', LogisticRegression(class_weight = 'balanced')) ])
model1
rs = {'random_state': 42}
log_reg = LogisticRegression(**rs)
nb = BernoulliNB()
clfs = [('Logistic Regression', log_reg)
,('Naive Bayes', nb)]
seed_skf = 42
skf = StratifiedKFold(n_splits = 10
, shuffle = True
, random_state = seed_skf)
X_array = np.array(numerical_features_df)
Y = target1
model_scores_df = pd.DataFrame()
fscoreL = []
mccL = []
presL = []
recallL = []
accuL = []
roc_aucL = []
for train_index, test_index in skf.split(X_array, Y):
    x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
    y_train_fold, y_test_fold = Y[train_index], Y[test_index]
    model1.fit(x_train_fold, y_train_fold)
    y_pred_fold = model1.predict(x_test_fold)
    #----------------
    # Model metrics
    #----------------
    # F1-Score
    fscore = f1_score(y_test_fold, y_pred_fold)
    fscoreL.append(fscore)
    fscoreM = mean(fscoreL) # running mean across the folds so far
    # Matthews correlation coefficient
    mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
    mccL.append(mcc)
    mccM = mean(mccL)
    # Precision
    pres = precision_score(y_test_fold, y_pred_fold)
    presL.append(pres)
    presM = mean(presL)
    # Recall
    recall = recall_score(y_test_fold, y_pred_fold)
    recallL.append(recall)
    recallM = mean(recallL)
    # Accuracy
    accu = accuracy_score(y_test_fold, y_pred_fold)
    accuL.append(accu)
    accuM = mean(accuL)
    # ROC_AUC
    roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
    roc_aucL.append(roc_auc)
    roc_aucM = mean(roc_aucL)
    model_scores_df = model_scores_df.append({'Model'    : model1.steps[1][0]
                                              , 'F1_score' : fscoreM
                                              , 'MCC'      : mccM
                                              , 'Precision': presM
                                              , 'Recall'   : recallM
                                              , 'Accuracy' : accuM
                                              , 'ROC_curve': roc_aucM}
                                              , ignore_index = True)
print('\nModel metrics:', model_scores_df)
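#%% Same metrics via cross_validate (hedged sketch)
# The manual SKF loop above can be reproduced with cross_validate and a dict of
# scorers; MCC has no string alias here, so it goes through make_scorer.
# Assumes cross_validate/make_scorer are imported as in the companion scripts.
mcc_scorer = make_scorer(matthews_corrcoef)
cv_out = cross_validate(model1
                        , X_array
                        , Y
                        , cv = skf
                        , scoring = {'f1'        : 'f1'
                                     , 'mcc'      : mcc_scorer
                                     , 'precision': 'precision'
                                     , 'recall'   : 'recall'
                                     , 'accuracy' : 'accuracy'
                                     , 'roc_auc'  : 'roc_auc'}
                        , return_train_score = False)
print(pd.DataFrame(cv_out).mean())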
#%% stratified KFold: Multiple_models:
input_df = numerical_features_df
#X_array = np.array(input_df)
Y = target1
var_type = 'numerical'
input_df = all_features_df
#X_array = np.array(input_df)
Y = target1
var_type = 'mixed'
input_df = categorical_features_df
#X_array = np.array(input_df)
Y = target1
var_type = 'categorical'
#=================
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix
# Determine preprocessing steps ~ var_type
if var_type == 'numerical':
    t = [('num', MinMaxScaler(), numerical_ix)]
if var_type == 'categorical':
    t = [('cat', OneHotEncoder(), categorical_ix)]
if var_type == 'mixed':
    t = [('cat', OneHotEncoder(), categorical_ix)
         , ('num', MinMaxScaler(), numerical_ix)]
##############################
col_transform = ColumnTransformer(transformers = t
, remainder='passthrough')
rs = {'random_state': 42}
#log_reg = LogisticRegression(**rs)
log_reg = LogisticRegression(class_weight = 'balanced')
nb = BernoulliNB()
rf = RandomForestClassifier(**rs)
clfs = [('Logistic Regression', log_reg)
#,('Naive Bayes', nb)
#, ('Random Forest' , rf)
]
#seed_skf = 42
skf = StratifiedKFold(n_splits = 10
, shuffle = True
#, random_state = seed_skf
, **rs)
#scores_df = pd.DataFrame()
fscoreL = []
mccL = []
presL = []
recallL = []
accuL = []
roc_aucL = []
for train_index, test_index in skf.split(input_df, Y):
    print('\nSKF train index:', train_index
          , '\nSKF test index:', test_index)
    x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
    y_train_fold, y_test_fold = Y.iloc[train_index], Y.iloc[test_index]
    # for train_index, test_index in skf.split(X_array, Y):
    #     print('\nSKF train index:', train_index
    #           , '\nSKF test index:', test_index)
    #     x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
    #     y_train_fold, y_test_fold = Y[train_index], Y[test_index]
    clf_scores_df = pd.DataFrame()
    for clf_name, clf in clfs:
        # model2 = Pipeline(steps=[('preprocess', MinMaxScaler())
        #                          , ('classifier', clf)])
        model2 = Pipeline(steps=[('preprocess', col_transform)
                                 , ('classifier', clf)])
        model2.fit(x_train_fold, y_train_fold)
        y_pred_fold = model2.predict(x_test_fold)
        #----------------
        # Model metrics
        #----------------
        # F1-Score
        fscore = f1_score(y_test_fold, y_pred_fold)
        fscoreL.append(fscore)
        fscoreM = mean(fscoreL)
        # Matthews correlation coefficient
        mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
        mccL.append(mcc)
        mccM = mean(mccL)
        # Precision
        pres = precision_score(y_test_fold, y_pred_fold)
        presL.append(pres)
        presM = mean(presL)
        # Recall
        recall = recall_score(y_test_fold, y_pred_fold)
        recallL.append(recall)
        recallM = mean(recallL)
        # Accuracy
        accu = accuracy_score(y_test_fold, y_pred_fold)
        accuL.append(accu)
        accuM = mean(accuL)
        # ROC_AUC
        roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
        roc_aucL.append(roc_auc)
        roc_aucM = mean(roc_aucL)
        clf_scores_df = clf_scores_df.append({'Model'     : clf_name
                                              , 'F1_score' : fscoreM
                                              , 'MCC'      : mccM
                                              , 'Precision': presM
                                              , 'Recall'   : recallM
                                              , 'Accuracy' : accuM
                                              , 'ROC_curve': roc_aucM}
                                              , ignore_index = True)
#scores_df = scores_df.append(clf_scores_df)
#%% Call functions
tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN)
tN_res
t2_res = MultClassPipeline2(X_train, X_test, y_train, y_test, input_df = all_features_df)
t2_res
#CHECK: numbers are awfully close to each other!
t3_res = MultClassPipeSKF(input_df = numerical_features_df
, y_targetF = target1
, var_type = 'numerical'
, skf_splits = 10)
t3_res
#CHECK: numbers are awfully close to each other!
t4_res = MultClassPipeSKF(input_df = all_features_df
, y_targetF = target1
, var_type = 'mixed'
, skf_splits = 10)
t4_res


@ -0,0 +1,195 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 5 12:57:32 2022
@author: tanu
"""
#%%
# Data, etc for now comes from my_data6.py and/or my_data5.py
#%% Specify dir and import functions
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/")
#%% Try combinations
#import sys, os
#os.system("imports.py")
def precision(y_true, y_pred):
    return precision_score(y_true, y_pred, pos_label = 1)
def recall(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label = 1)
def f1(y_true, y_pred):
    return f1_score(y_true, y_pred, pos_label = 1)
#%% Check df features
numerical_features_df.shape
categorical_features_df.shape
all_features_df.shape
all_features_df.dtypes
#%% Simple train and test data splits
target = target1
#target = target3
X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df,
target,
test_size = 0.33,
random_state = 42)
X_trainC, X_testC, y_trainC, y_testC = train_test_split(categorical_features_df,
target,
test_size = 0.33,
random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(all_features_df,
target,
test_size = 0.33,
random_state = 42)
#%% Stratified K-fold: Single model
input_df = numerical_features_df
#X_array = np.array(input_df)
var_type = 'numerical'
input_df = all_features_df
#X_array = np.array(input_df)
var_type = 'mixed'
input_df = categorical_features_df
#X_array = np.array(input_df)
var_type = 'categorical'
y_targetF = target1
#==============================================================================
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix
# Determine preprocessing steps ~ var_type
if var_type == 'numerical':
    t = [('num', MinMaxScaler(), numerical_ix)]
if var_type == 'categorical':
    t = [('cat', OneHotEncoder(), categorical_ix)]
if var_type == 'mixed':
    t = [('cat', OneHotEncoder(), categorical_ix)
         , ('num', MinMaxScaler(), numerical_ix)]
###############################################################################
col_transform = ColumnTransformer(transformers = t
, remainder='passthrough')
###############################################################################
rs = {'random_state': 42}
#del(model1) # NameError on a fresh session: model1 is not defined yet at this point
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
                           , ('log_reg', LogisticRegression(class_weight = None)) ]) # 'unbalanced' is not a valid option; None (the default) applies no reweighting
# model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
# , ('log_reg', LogisticRegression(**rs)) ])
del(model1)
nb = BernoulliNB()
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('nb', nb) ])
del(model1)
knn = KNeighborsClassifier()
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('knn', knn) ])
del(model1)
svm = SVC(**rs)
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('svm', svm) ])
del(model1)
mlp = MLPClassifier(max_iter = 500, **rs)
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('mlp', mlp) ])
del(model1)
dt = DecisionTreeClassifier(**rs)
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('dt', dt) ])
del(model1)
et = ExtraTreesClassifier(**rs)
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('et', et) ])
del(model1)
rf = RandomForestClassifier(**rs)
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('rf', rf) ])
###############################################################################
#%% run
#del(mm) # NameError on a fresh session: mm is not defined yet at this point
skf = StratifiedKFold(n_splits = 10
, shuffle = True
, **rs)
#X_array = np.array(numerical_features_df)
#Y = target1
model_scores_df = pd.DataFrame()
fscoreL = []
mccL = []
presL = []
recallL = []
accuL = []
roc_aucL = []
# for train_index, test_index in skf.split(X_array, Y):
# x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
# y_train_fold, y_test_fold = Y[train_index], Y[test_index]
for train_index, test_index in skf.split(input_df, y_targetF):
    x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
    y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
    model1.fit(x_train_fold, y_train_fold)
    y_pred_fold = model1.predict(x_test_fold)
    #----------------
    # Model metrics
    #----------------
    # F1-Score
    fscore = f1_score(y_test_fold, y_pred_fold)
    fscoreL.append(fscore)
    fscoreM = mean(fscoreL) # running mean across the folds so far
    # Matthews correlation coefficient
    mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
    mccL.append(mcc)
    mccM = mean(mccL)
    # Precision
    pres = precision_score(y_test_fold, y_pred_fold)
    presL.append(pres)
    presM = mean(presL)
    # Recall
    recall = recall_score(y_test_fold, y_pred_fold)
    recallL.append(recall)
    recallM = mean(recallL)
    # Accuracy
    accu = accuracy_score(y_test_fold, y_pred_fold)
    accuL.append(accu)
    accuM = mean(accuL)
    # ROC_AUC
    roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
    roc_aucL.append(roc_auc)
    roc_aucM = mean(roc_aucL)
    model_scores_df = model_scores_df.append({'Model'    : model1.steps[1][0]
                                              , 'F1_score' : fscoreM
                                              , 'MCC'      : mccM
                                              , 'Precision': presM
                                              , 'Recall'   : recallM
                                              , 'Accuracy' : accuM
                                              , 'ROC_curve': roc_aucM}
                                              , ignore_index = True)
print('\nModel metrics:\n', model_scores_df)
mm = model_scores_df.mean()
print('\nModel metrics mean:\n', mm)
print('\nModel metrics:\n', model_scores_df)


@ -0,0 +1,179 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 24 10:48:10 2022
@author: tanu
"""
###############################################################################
# questions:
# which data to use: merged_df3 or merged_df2
# which is the target? or_mychisq or drtype col
# scaling: can it be from -1 to 1?
# how to include the mutation information?
# 'wild_type', 'mutant', 'postion'
# whether to log transform the af and or cols
# to allow mean mode values to be imputed for validation set
# whether to calculate mean, median accounting for NA or removing them?
# strategy:
# available data = X_train
# available data but NAN = validation_test
# test data: mut generated not in mcsm
###############################################################################
import os, sys
import re
from sklearn.datasets import load_boston
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
#%% read data
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
# this needs to be merged_df2 or merged_df3?
my_df = pd.read_csv("pnca_all_params.csv")
my_df.dtypes
my_df_cols = my_df.columns
omit_cols1 = ['pdb_file'
, 'seq_offset4pdb'
, 'mut_3upper'
, 'wild_pos'
, 'wild_chain_pos'
, 'chain'
, 'wt_3upper'
, 'consurf_colour'
, 'consurf_colour_rev'
, 'consurf_msa_data'
, 'consurf_aa_variety'
, 'snap2_accuracy_pc'
, 'beta_logistic'
, 'se_logistic'
, 'zval_logisitc'
, 'pval_chisq'
, 'log10_or_mychisq'
, 'neglog_pval_fisher'
, 'or_fisher'
, 'wild_type'
, 'mutant_type'
, 'position'
, 'ligand_id'
, 'mutation'
, 'ss'
, 'ss_class' # include it later?
, 'contacts'
]
omit_cols2 = list(my_df.columns[my_df.columns.str.contains(".*ci_.*") | my_df.columns.str.contains(".*_scaled*") | my_df.columns.str.contains(".*_outcome*")])
# [WATCH:] just to test since these have negative values!
omit_cols3 = list(my_df.columns[my_df.columns.str.contains("electro_.*") | my_df.columns.str.contains("disulfide_.*") | my_df.columns.str.contains("hbonds_.*") | my_df.columns.str.contains("partcov_.*") | my_df.columns.str.contains("vdwclashes.*") | my_df.columns.str.contains("volumetric.*")])
omit_cols = omit_cols1 + omit_cols2 + omit_cols3
my_df_filt = my_df.loc[:, ~my_df.columns.isin(omit_cols)]
my_df_filt_cols = my_df_filt.columns
#fill NaNs with column means in each column
my_df_filt2 = my_df_filt.fillna(my_df_filt.mean())
my_df_filt3 = my_df_filt.fillna(my_df_filt.median())
my_df_filt_noNA = my_df_filt.fillna(0)
summ = my_df_filt.describe()
summ_noNA = my_df_filt_noNA.describe()
foo = my_df_filt['or_mychisq'].value_counts()
foo = foo.to_frame()
########################
# [WATCH]: Drop na
my_df2 = my_df_filt3.dropna()
my_df2['resistance'] = my_df2['or_mychisq'].apply(lambda x: 0 if x <=1 else 1)
my_df2['resistance'].value_counts()
y = my_df2['resistance']
y.value_counts()
#%%============================================================================
X_validation_muts = my_df['mutationinformation'][~my_df['mutationinformation'].isin(my_df2['mutationinformation'])]
X_validation_all = my_df_filt3[~my_df_filt3['mutationinformation'].isin(my_df2['mutationinformation'])]
X_validation_f = X_validation_all.loc[:, ~X_validation_all.columns.isin(['or_mychisq', 'resistance'])]
X_validation = X_validation_f.set_index('mutationinformation')
#%% fill na in cols with mean value
X_validation.info()
X_validation.isna().any()
na_df = X_validation_f[X_validation_f.columns[X_validation_f.isna().any()]]
na_colnames = X_validation_f.columns[X_validation_f.isna().any()]
na_colsL = list(na_colnames)
#==============================================================================
omit_cols_y = ['or_mychisq', 'resistance']
my_df_ml = my_df2.loc[:, ~my_df2.columns.isin(omit_cols_y)]
#%%############################################################################
X_train = my_df_ml.set_index('mutationinformation')
#X_train = X_train.iloc[:,:4]
y_train = y
#X_train = X_train.dropna()
#y_train = y.dropna()
# check dim
X_train.shape
y_train.shape
###############################################################################
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score
model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
, ('logis', LogisticRegression(class_weight = 'balanced'))
])
model_logisP.fit(X_train, y_train)
fitted_vals = model_logisP.predict(X_train)
fitted_vals
# gives the array of predictions
model_logisP.predict(X_train)
model_logisP.predict(X_validation)
y_pred = model_logisP.predict(X_train)
y_pred2 = model_logisP.predict(X_validation)
accuracy_score(y_train, y_pred)
precision_score(y_train, y_pred, pos_label = 1)# tp/(tp + fp)
recall_score(y_train, y_pred, pos_label = 1) # tp/(tp + fn)
# (no ground-truth labels exist for X_validation, so y_pred2 cannot be scored here)
################
acc = make_scorer(accuracy_score)
def precision(y_true,y_pred):
return precision_score(y_true,y_pred,pos_label = 1) #0
def recall(y_true,y_pred):
return recall_score(y_true, y_pred, pos_label = 1) #0
prec = make_scorer(precision)
rec = make_scorer(recall)
output = cross_validate(model_logisP
, X_train
, y
, scoring = {'acc' : acc
,'prec' : prec
,'rec' : rec}
, cv = 10, return_train_score = False)
pd.DataFrame(output).mean()
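# A minimal alternative (sketch): make_scorer() forwards extra keyword
# arguments to the underlying metric, so the wrapper functions above are
# not strictly needed.
prec_alt = make_scorer(precision_score, pos_label = 1)
rec_alt = make_scorer(recall_score, pos_label = 1)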


@ -0,0 +1,376 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 24 10:48:10 2022
@author: tanu
"""
###############################################################################
# questions:
# which data to use: merged_df3 or merged_df2
# which is the target? or_mychisq or drtype col
# scaling: can it be from -1 to 1?
# how to include the mutation information?
# 'wild_type', 'mutant', 'postion'
# whether to log transform the af and or cols
# to allow mean mode values to be imputed for validation set
# whether to calculate mean, median accounting for NA or removing them?
# strategy:
# available data = X_train
# available data but NAN = validation_test
# test data: mut generated not in mcsm
###############################################################################
import os, sys
import re
from sklearn.datasets import load_boston
from sklearn import datasets
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
from statistics import mean, stdev
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
#%% read data
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
# this needs to be merged_df2 or merged_df3?
my_df = pd.read_csv("pnca_all_params.csv")
my_df.dtypes
my_df_cols = my_df.columns
omit_cols1 = ['pdb_file'
, 'seq_offset4pdb'
, 'mut_3upper'
, 'wild_pos'
, 'wild_chain_pos'
, 'chain'
, 'wt_3upper'
, 'consurf_colour'
, 'consurf_colour_rev'
, 'consurf_msa_data'
, 'consurf_aa_variety'
, 'snap2_accuracy_pc'
, 'beta_logistic'
, 'se_logistic'
, 'zval_logisitc'
, 'pval_chisq'
, 'log10_or_mychisq'
, 'neglog_pval_fisher'
, 'or_fisher'
, 'wild_type'
, 'mutant_type'
, 'position'
, 'ligand_id'
, 'mutation'
, 'ss'
, 'ss_class' # include it later?
, 'contacts'
]
omit_cols2 = list(my_df.columns[my_df.columns.str.contains(".*ci_.*") | my_df.columns.str.contains(".*_scaled*") | my_df.columns.str.contains(".*_outcome*")])
# [WATCH:] just to test since these have negative values!
omit_cols3 = list(my_df.columns[my_df.columns.str.contains("electro_.*") | my_df.columns.str.contains("disulfide_.*") | my_df.columns.str.contains("hbonds_.*") | my_df.columns.str.contains("partcov_.*") | my_df.columns.str.contains("vdwclashes.*") | my_df.columns.str.contains("volumetric.*")])
omit_cols = omit_cols1 + omit_cols2 + omit_cols3
# Filter df: Filter columns to focus on my selected ones
my_df_filt = my_df.loc[:, ~my_df.columns.isin(omit_cols)]
my_df_filt_cols = my_df_filt.columns
#Fill na of filtered df: fill NaNs with column means/medians in each column
my_df_filt2 = my_df_filt.fillna(my_df_filt.mean())
my_df_filt3 = my_df_filt.fillna(my_df_filt.median())
#my_df_filt_noNA = my_df_filt.fillna(0)
summ = my_df_filt.describe()
summ2 = my_df_filt2.describe()
summ3 = my_df_filt3.describe()
#summ_noNA = my_df_filt_noNA.describe()
########################
# [WATCH]: Drop na
# Get Y
my_df2 = my_df_filt.dropna().copy() # copy() avoids SettingWithCopyWarning on the assignment below
my_df2['resistance'] = my_df2['or_mychisq'].apply(lambda x: 0 if x <=1 else 1)
my_df2['resistance'].value_counts()
Y = my_df2['resistance']
Y = np.array(Y)
#Y = Y.reset_index()
#Y = Y.drop(['index'], axis = 1)
#Y.value_counts()
#Y = np.array(Y)
# GET X
omit_cols_y = ['or_mychisq', 'resistance']
my_df_ml = my_df2.loc[:, ~my_df2.columns.isin(omit_cols_y)]
#my_df_ml = my_df_ml.set_index('mutationinformation')
X = my_df_ml
X = X.drop(['mutationinformation'], axis = 1)
X = np.array(X)
#X = X.reset_index()
# check dim
X.shape
Y.shape
my_df2 = my_df2.reset_index()
#####################
#https://stackoverflow.com/questions/49134338/kfolds-cross-validation-vs-train-test-split
rf = RandomForestClassifier(n_estimators=100, random_state=42)
#https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2
# k-FOLD
print('Class Ratio:',
sum(Y)/len(Y))
print('Class Ratio:',
sum(my_df2['resistance'])/len(my_df2['resistance'])
)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
target = my_df2.loc[:,'resistance']
fold_no = 1
for train_index, test_index in skf.split(my_df2, target):
train = my_df2.loc[train_index,:]
test = my_df2.loc[test_index,:]
print('Fold',str(fold_no),
'Class Ratio:',
sum(test['resistance'])/len(test['resistance']))
fold_no += 1
model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
, ('logis', LogisticRegression(class_weight = 'balanced'))
])
X_features = my_df_ml.drop(['mutationinformation'], axis = 1).columns.to_list() # X itself is already a numpy array
def train_model(train, test, fold_no):
X = X_features
y = 'resistance' # single label column, so train[y] below is a Series rather than a DataFrame
X_train = train[X]
y_train = train[y]
X_test = test[X]
y_test = test[y]
model_logisP.fit(X_train,y_train)
predictions = model_logisP.predict(X_test)
print('Fold',str(fold_no),
'Accuracy:',
accuracy_score(y_test,predictions))
fold_no = 1
for train_index, test_index in skf.split(my_df2, target):
train = my_df2.loc[train_index,:]
test = my_df2.loc[test_index,:]
train_model(train,test,fold_no)
fold_no += 1
#%%
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=20)
lst_accu_stratified = []
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = X_scaled[:,[1,2,3,15,16]]
#lr = linear_model.LogisticRegression(class_weight = 'unbalanced')
lr = linear_model.LogisticRegression()
for train_index1, test_index1 in skf.split(X, Y):
#print(train_index)
#print(test_index)
x_train_fold1, x_test_fold1 = X_scaled[train_index1], X_scaled[test_index1]
y_train_fold1, y_test_fold1 = Y[train_index1], Y[test_index1]
lr.fit(x_train_fold1, y_train_fold1)
lst_accu_stratified.append(lr.score(x_test_fold1, y_test_fold1))
# print output
print('List of possible accuracy', lst_accu_stratified)
print('Max accuracy:', max(lst_accu_stratified)*100, "%")
print('Min accuracy:', min(lst_accu_stratified)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratified)*100,"%")
print('St Dev:', stdev(lst_accu_stratified)*100,"%")
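# Sketch: scaler.fit_transform(X) above was fitted on the full data before
# the folds were split, so every test fold has already influenced the scaler
# (mild leakage). Fitting the scaler inside a Pipeline keeps it per-fold.
# (Assumes the same X, Y and skf as above; the column slicing is omitted.)
from sklearn.model_selection import cross_val_score
pipe_lr = Pipeline(steps = [('pre', preprocessing.MinMaxScaler())
, ('lr', linear_model.LogisticRegression())])
leak_free_acc = cross_val_score(pipe_lr, X, Y, cv = skf)
print('Mean accuracy (leak-free):', mean(leak_free_acc)*100, "%")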
# cancer data
cancer = datasets.load_breast_cancer()
x = cancer.data
y = cancer.target
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
lst_accu_stratifiedC = []
scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(x)
x_scaled = x_scaled[:,[1,2,3, 15, 16]]
for train_index, test_index in skf.split(x, y):
#print(train_index)
#print(test_index)
x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index]
y_train_fold, y_test_fold = y[train_index], y[test_index]
lr.fit(x_train_fold, y_train_fold)
lst_accu_stratifiedC.append(lr.score(x_test_fold, y_test_fold))
# print output
print('List of possible accuracy', lst_accu_stratifiedC)
print('Max accuracy:', max(lst_accu_stratifiedC)*100, "%")
print('Min accuracy:', min(lst_accu_stratifiedC)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratifiedC)*100,"%")
print('St Dev:', stdev(lst_accu_stratifiedC)*100,"%")
#%%
##
# https://towardsdatascience.com/my-random-forest-classifier-cheat-sheet-in-python-fedb84f8cf4f
y_all = my_df_filt['or_mychisq'].apply(lambda x: 0 if x <=1 else 1) # NOTE: NaN fails x <= 1, so rows with missing OR are labelled 1 here
X_all = my_df_filt.drop(['mutationinformation', 'or_mychisq'], axis = 1)
seed = 20 # so that the result is reproducible
X_all = X_all.iloc[:,:6]
X_train, X_test, y_train, y_test = train_test_split(X_all,y_all
, test_size=0.333
, random_state = seed)
# One option is to make NA a category of its own.
# In pandas, missing values are NaN, and the encoder would ignore them.
# Replacing NaN with the string 'na' creates an explicit 'na' category
# that will be taken into account when encoding later on.
#X_train = X_train.fillna('na')
#X_test = X_test.fillna('na')
X_train = X_train.fillna(X_train.median())
X_test = X_test.fillna(X_test.median())
X_train.dtypes
features_to_encode = list(X_train.select_dtypes(include = ['object']).columns)
col_trans = make_column_transformer(
(OneHotEncoder(),features_to_encode),
remainder = "passthrough"
)
rf_classifier = RandomForestClassifier(
min_samples_leaf=50,
n_estimators=150,
bootstrap=True,
oob_score=True,
n_jobs=-1,
random_state=seed,
max_features='sqrt') # 'auto' is deprecated for classifiers and equals 'sqrt'
pipe = make_pipeline(col_trans, rf_classifier)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
accuracy_score(y_test, y_pred)
print(f"The accuracy of the model is {round(accuracy_score(y_test,y_pred),3)*100} %")
recall_score(y_test, y_pred)
precision_score(y_test, y_pred)
f1_score(y_test, y_pred)
roc_auc_score (y_test, y_pred)
roc_curve(y_test, y_pred)
train_probs = pipe.predict_proba(X_train)[:,1]
probs = pipe.predict_proba(X_test)[:, 1]
train_predictions = pipe.predict(X_train)
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs)}')
print(f'Test ROC AUC Score: {roc_auc_score(y_test, probs)}')
def evaluate_model(y_pred, probs,train_predictions, train_probs):
baseline = {}
baseline['recall']=recall_score(y_test,
[1 for _ in range(len(y_test))])
baseline['precision'] = precision_score(y_test,
[1 for _ in range(len(y_test))])
baseline['roc'] = 0.5
results = {}
results['recall'] = recall_score(y_test, y_pred)
results['precision'] = precision_score(y_test, y_pred)
results['roc'] = roc_auc_score(y_test, probs)
train_results = {}
train_results['recall'] = recall_score(y_train,
train_predictions)
train_results['precision'] = precision_score(y_train, train_predictions)
train_results['roc'] = roc_auc_score(y_train, train_probs)
# for metric in ['recall', 'precision', 'roc']:
# print(f"Baseline: {round(baseline[metric], 2)}Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}")
# Calculate false positive rates and true positive rates
base_fpr, base_tpr, _ = roc_curve(y_test, [1 for _ in range(len(y_test))])
model_fpr, model_tpr, _ = roc_curve(y_test, probs)
plt.figure(figsize = (8, 6))
plt.rcParams['font.size'] = 16
# Plot both curves
plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
plt.plot(model_fpr, model_tpr, 'r', label = 'model')
plt.legend(); plt.xlabel('False Positive Rate');
plt.ylabel('True Positive Rate'); plt.title('ROC Curves');
plt.show()
# Recall Baseline: 1.0 Test: 0.92 Train: 0.93
# Precision Baseline: 0.48 Test: 0.9 Train: 0.91
# Roc Baseline: 0.5 Test: 0.97 Train: 0.97
evaluate_model(y_pred,probs,train_predictions,train_probs)
#%%
import itertools
def plot_confusion_matrix(cm, classes, normalize = False,
title='Confusion matrix',
cmap=plt.cm.Greens): # can change color
plt.figure(figsize = (10, 10))
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title, size = 24)
plt.colorbar(aspect=4)
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45, size = 14)
plt.yticks(tick_marks, classes, size = 14)
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
# Label the plot
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): plt.text(j, i, format(cm[i, j], fmt),
fontsize = 20,
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.grid(None)
plt.tight_layout()
plt.ylabel('True label', size = 18)
plt.xlabel('Predicted label', size = 18)
# Let's plot it out
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes = ['0 - Susceptible', '1 - Resistant'],
title = 'R/S Confusion Matrix')
print(rf_classifier.feature_importances_)
print(f" There are {len(rf_classifier.feature_importances_)} features in total")


@ -0,0 +1,361 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 24 10:48:10 2022
@author: tanu
"""
###############################################################################
# questions:
# which data to use: merged_df3 or merged_df2
# which is the target? or_mychisq or drtype col
# scaling: can it be from -1 to 1?
# how to include the mutation information?
# 'wild_type', 'mutant', 'postion'
# whether to log transform the af and or cols
# to allow mean mode values to be imputed for validation set
# whether to calculate mean, median accounting for NA or removing them?
# strategy:
# available data = X_train
# available data but NAN = validation_test
# test data: mut generated not in mcsm
###############################################################################
import os, sys
import re
from sklearn.datasets import load_boston
from sklearn import datasets
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
from statistics import mean, stdev
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
from sklearn.metrics import plot_precision_recall_curve # deprecated since scikit-learn 1.0; PrecisionRecallDisplay.from_estimator replaces it
import itertools
#%% read data
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
# this needs to be merged_df2 or merged_df3?
#gene 'pncA'
drug = 'pyrazinamide'
my_df = pd.read_csv("pnca_merged_df3.csv")
my_df.dtypes
my_df_cols = my_df.columns
#%%
# GET Y
# Y = my_df.loc[:,drug] #has NA
dm_om_map = {'DM': 1, 'OM': 0}
my_df['resistance'] = my_df['mutation_info_labels'].map(dm_om_map)
# sanity check
my_df['resistance'].value_counts()
my_df['mutation_info_labels'].value_counts()
Y = my_df['resistance']
#%%
# GET X
cols = my_df.columns
X = my_df[['ligand_distance'
, 'ligand_affinity_change'
, 'duet_stability_change', 'ddg_foldx', 'deepddg', 'ddg_dynamut2', 'consurf_score'
, 'snap2_score'
#, 'snap2_accuracy_pc'
, 'asa'
, 'rsa']]
#%%
####################################
# SIMPLEST case of train_test split
# Random forest
# one hot encoder
# MinMaxScaler
# https://towardsdatascience.com/my-random-forest-classifier-cheat-sheet-in-python-fedb84f8cf4f
####################################
seed = 50
X_train, X_test, y_train, y_test = train_test_split(X,Y
, test_size = 0.333
, random_state = seed)
features_to_encode = list(X_train.select_dtypes(include = ['object']).columns)
col_trans = make_column_transformer(
(OneHotEncoder(),features_to_encode),
remainder = "passthrough"
)
MinMaxS = preprocessing.MinMaxScaler()
standardS = preprocessing.StandardScaler()
rf_classifier = RandomForestClassifier(
min_samples_leaf=50,
n_estimators=150,
bootstrap=True,
oob_score=True,
n_jobs=-1,
random_state=seed,
max_features='sqrt') # 'auto' is deprecated for classifiers and equals 'sqrt'
pipe = make_pipeline(col_trans
#, MinMaxS
#, standardS
, rf_classifier)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
accuracy_score(y_test, y_pred)
print("\nModel evaluation:\n")
print(f"Accuracy: {round(accuracy_score(y_test,y_pred),3)*100} %")
print(f"Recall: {round(recall_score(y_test,y_pred),3)*100} %")
print(f"Precision: {round(precision_score(y_test,y_pred),3)*100} %")
print(f"F1-score: {round(f1_score(y_test,y_pred),3)*100} %")
recall_score(y_test, y_pred)
precision_score(y_test, y_pred)
f1_score(y_test, y_pred)
roc_auc_score (y_test, y_pred) # on hard 0/1 predictions; probability scores (as below) are preferable
roc_curve(y_test, y_pred) # same caveat: pass probability scores for a meaningful curve
disp = plot_precision_recall_curve(pipe, X_test, y_test)
train_probs = pipe.predict_proba(X_train)[:,1]
probs = pipe.predict_proba(X_test)[:, 1]
train_predictions = pipe.predict(X_train)
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs)}')
print(f'Test ROC AUC Score: {roc_auc_score(y_test, probs)}')
def evaluate_model(y_pred, probs,train_predictions, train_probs):
baseline = {}
baseline['recall']=recall_score(y_test,
[1 for _ in range(len(y_test))])
baseline['precision'] = precision_score(y_test,
[1 for _ in range(len(y_test))])
baseline['roc'] = 0.5
results = {}
results['recall'] = recall_score(y_test, y_pred)
results['precision'] = precision_score(y_test, y_pred)
results['roc'] = roc_auc_score(y_test, probs)
train_results = {}
train_results['recall'] = recall_score(y_train,
train_predictions)
train_results['precision'] = precision_score(y_train, train_predictions)
train_results['roc'] = roc_auc_score(y_train, train_probs)
# for metric in ['recall', 'precision', 'roc']:
# print(f"Baseline: {round(baseline[metric], 2)}Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}")
# Calculate false positive rates and true positive rates
base_fpr, base_tpr, _ = roc_curve(y_test, [1 for _ in range(len(y_test))])
model_fpr, model_tpr, _ = roc_curve(y_test, probs)
plt.figure(figsize = (8, 6))
plt.rcParams['font.size'] = 16
# Plot both curves
plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
plt.plot(model_fpr, model_tpr, 'r', label = 'model')
plt.legend(); plt.xlabel('False Positive Rate');
plt.ylabel('True Positive Rate'); plt.title('ROC Curves');
plt.show()
# Recall Baseline: 1.0 Test: 0.92 Train: 0.93
# Precision Baseline: 0.48 Test: 0.9 Train: 0.91
# Roc Baseline: 0.5 Test: 0.97 Train: 0.97
evaluate_model(y_pred,probs,train_predictions,train_probs)
def plot_confusion_matrix(cm, classes, normalize = False,
title='Confusion matrix',
cmap=plt.cm.Greens): # can change color
plt.figure(figsize = (10, 10))
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title, size = 24)
plt.colorbar(aspect=4)
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45, size = 14)
plt.yticks(tick_marks, classes, size = 14)
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
# Label the plot
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): plt.text(j, i, format(cm[i, j], fmt),
fontsize = 20,
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.grid(None)
plt.tight_layout()
plt.ylabel('True label', size = 18)
plt.xlabel('Predicted label', size = 18)
# Let's plot it out
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes = ['0 - Susceptible', '1 - Resistant'],
title = 'R/S Confusion Matrix')
print(rf_classifier.feature_importances_)
print(f" There are {len(rf_classifier.feature_importances_)} features in total")
#%%
####################################
# Model 2: case of stratified K-fold
# Logistic regression
# MinMaxScaler
# https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2 [ Didn't work!]
# https://www.geeksforgeeks.org/stratified-k-fold-cross-validation/
####################################
print('Class Ratio:',
sum(Y)/len(Y))
print('Class Ratio:',
sum(my_df['resistance'])/len(my_df['resistance']))
seed_skf = 50
skf = StratifiedKFold(n_splits = 10
, shuffle = True
, random_state = seed_skf)
lst_accu_stratified = []
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X)
#X_scaled = X_scaled[:,[1,2,3]]
#lr = linear_model.LogisticRegression(class_weight = 'unbalanced')
lr = linear_model.LogisticRegression()
for train_index, test_index in skf.split(X, Y):
#print(train_index)
#print(test_index)
x_train_fold, x_test_fold = X_scaled[train_index], X_scaled[test_index]
y_train_fold, y_test_fold = Y[train_index], Y[test_index]
lr.fit(x_train_fold, y_train_fold)
lst_accu_stratified.append(lr.score(x_test_fold, y_test_fold))
# print output
print('List of possible accuracy', lst_accu_stratified)
print('Max accuracy:', max(lst_accu_stratified)*100, "%")
print('Min accuracy:', min(lst_accu_stratified)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratified)*100,"%")
print('St Dev:', stdev(lst_accu_stratified)*100,"%")
#%%
#--------------------------------------
# Model2.1: the same model but with a pipeline
# results differ slightly from Model 2, partly because the scaler is
# re-fitted on each training fold instead of once on the full data
#--------------------------------------
model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
, ('logis', LogisticRegression(class_weight = 'balanced')) ]) # changes stdev
seed_skf = 50
skf = StratifiedKFold(n_splits = 10
, shuffle = True
, random_state = seed_skf)
X_array = np.array(X)
lst_accu_stratified = []
for train_index, test_index in skf.split(X_array, Y):
x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
y_train_fold, y_test_fold = Y[train_index], Y[test_index]
model_logisP.fit(x_train_fold, y_train_fold)
lst_accu_stratified.append(model_logisP.score(x_test_fold, y_test_fold))
# print output
print('List of possible accuracy', lst_accu_stratified)
print('Max accuracy:', max(lst_accu_stratified)*100, "%")
print('Min accuracy:', min(lst_accu_stratified)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratified)*100,"%")
print('St Dev:', stdev(lst_accu_stratified)*100,"%")
####################################
# Model 3: stratified K-fold
# Random forest
# MinMaxScaler
# X: needs to be an array for str Kfold
####################################
model_rf = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
, ('rf' , RandomForestClassifier(n_estimators=100, random_state=42))])
seed_skf = 50
skf = StratifiedKFold(n_splits = 10
, shuffle = True
, random_state = seed_skf)
X_array = np.array(X)
lst_accu_stratified_rf = []
for train_index, test_index in skf.split(X_array, Y):
x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
y_train_fold, y_test_fold = Y[train_index], Y[test_index]
model_rf.fit(x_train_fold, y_train_fold)
lst_accu_stratified_rf.append(model_rf.score(x_test_fold, y_test_fold))
# print output
print('List of possible accuracy', lst_accu_stratified_rf)
print('Max accuracy:', max(lst_accu_stratified_rf)*100, "%")
print('Min accuracy:', min(lst_accu_stratified_rf)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratified_rf)*100,"%")
print('St Dev:', stdev(lst_accu_stratified_rf)*100,"%")
####################################
# Model 4: Cross validate K-fold
# Random forest
# MinMaxScaler
# X: needs to be an array for Kfold
# NOTE: mean_squared_error is a regression metric; use a classification scorer instead (see the sketch below)
####################################
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate
score_fn = make_scorer(mean_squared_error)
scores = cross_validate(model_rf, X_train, y_train
, scoring = score_fn
, cv = 10)
from itertools import combinations
def train(X):
return cross_validate(model_rf, X, y_train
, scoring = score_fn
, cv = 10
, return_estimator = True)['test_score']
scores = [train(X_train.loc[:,vars]) for vars in combinations(X_train.columns, len(X_train.columns))] # r must not exceed the number of columns, else combinations() is empty
means = [score.mean() for score in scores]
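# Sketch addressing the NOTE above: built-in classification scorers can be
# passed by name, avoiding the regression metric altogether.
scores_clf = cross_validate(model_rf, X_train, y_train
, scoring = {'acc': 'accuracy', 'roc': 'roc_auc'}
, cv = 10)
pd.DataFrame(scores_clf).mean()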
#%%
# https://stackoverflow.com/questions/52316237/finding-logistic-regression-weights-from-k-fold-cv
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import KFold
kf = KFold(n_splits=10, shuffle=True, random_state=42)
logistic = LogisticRegressionCV(Cs=2, fit_intercept=True, cv=kf, verbose =1, random_state=42)
logistic.fit(X_train, y_train)
print("Train Coefficient:" , logistic.coef_) #weights of each feature
print("Train Intercept:" , logistic.intercept_) #value of intercept
#%%
# https://machinelearningmastery.com/repeated-k-fold-cross-validation-with-python/
from sklearn.model_selection import cross_val_score
from numpy import std
cv = KFold(n_splits=10, random_state=1, shuffle=True)
scores = cross_val_score(model_rf, X,Y, scoring='accuracy', cv=cv, n_jobs=-1)
scores2 = cross_val_score(model_logisP, X, Y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
print('Accuracy: %.3f (%.3f)' % (mean(scores2), stdev(scores2)))
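# Sketch (assuming the same X and Y): for an imbalanced classification target,
# RepeatedStratifiedKFold keeps the class ratio in every fold while repeating
# the whole split, giving a more stable estimate than a single KFold.
from sklearn.model_selection import RepeatedStratifiedKFold
rskf = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
scores_rskf = cross_val_score(model_rf, X, Y, scoring = 'accuracy', cv = rskf, n_jobs = -1)
print('Accuracy: %.3f (%.3f)' % (mean(scores_rskf), std(scores_rskf)))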


@ -0,0 +1,172 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 3 17:08:18 2022
@author: tanu
"""
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import os
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
import pandas as pd
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
gene = 'pncA'
drug = 'pyrazinamide'
#==============
# directories
#==============
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
#=======
# input
#=======
# this needs to be merged_df2 or merged_df3?
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
my_df = pd.read_csv(infile_ml1)
my_df.dtypes
my_df_cols = my_df.columns
gene_basicL = ['pnca']
geneL_naL = ['gid', 'rpob']
geneL_ppi2L = ['alr', 'embb', 'katg', 'rpob']
#%%============================================================================
# GET Y
# Y = my_df.loc[:,drug] #has NA
dm_om_map = {'DM': 1, 'OM': 0}
my_df['resistance'] = my_df['mutation_info_labels'].map(dm_om_map)
# sanity check
my_df['resistance'].value_counts()
my_df['mutation_info_labels'].value_counts()
Y = my_df['resistance']
# GET X
cols = my_df.columns
X_stability = my_df[['ligand_distance'
, 'ligand_affinity_change'
, 'duet_stability_change'
, 'ddg_foldx'
, 'deepddg'
, 'ddg_dynamut2']]
X_evol = my_df[['consurf_score'
, 'snap2_score'
, 'snap2_accuracy_pc']]
X_str = my_df[['asa'
, 'rsa'
, 'kd_values'
, 'rd_values']]
#%% try combinations
X_vars = X_stability
X_vars = X_evol
X_vars = X_str
X_vars = pd.concat([X_stability, X_evol, X_str], axis = 1)
X_vars = pd.concat([X_stability, X_evol], axis = 1)
X_vars = pd.concat([X_stability, X_str], axis = 1)
X_vars = pd.concat([X_evol, X_str], axis = 1)
#%%
X_vars.shape[1]
# TODO: stratified cross validate
# Train-test Split
rs = {'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X_vars,
Y,
test_size = 0.33,
random_state = 42)
# Classification - Model Pipeline
def modelPipeline(X_train, X_test, y_train, y_test):
log_reg = LogisticRegression(**rs)
nb = BernoulliNB()
knn = KNeighborsClassifier()
svm = SVC(**rs)
mlp = MLPClassifier(max_iter=500, **rs)
dt = DecisionTreeClassifier(**rs)
et = ExtraTreesClassifier(**rs)
rf = RandomForestClassifier(**rs)
xgb = XGBClassifier(**rs, verbosity=0)
clfs = [
('Logistic Regression', log_reg),
('Naive Bayes', nb),
('K-Nearest Neighbors', knn),
('SVM', svm),
('MLP', mlp),
('Decision Tree', dt),
('Extra Trees', et),
('Random Forest', rf),
('XGBoost', xgb)
]
pipelines = []
scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
for clf_name, clf in clfs:
pipeline = Pipeline(steps=[
('scaler', MinMaxScaler()),
('classifier', clf)
]
)
pipeline.fit(X_train, y_train)
# Model predictions
y_pred = pipeline.predict(X_test)
# F1-Score
fscore = f1_score(y_test, y_pred)
# Precision
pres = precision_score(y_test, y_pred)
# Recall
rcall = recall_score(y_test, y_pred)
# Accuracy
accu = accuracy_score(y_test, y_pred)
# ROC_AUC (computed on hard labels; predict_proba scores would be preferable)
roc_auc = roc_auc_score(y_test, y_pred)
pipelines.append(pipeline)
scores_df = pd.concat([scores_df
, pd.DataFrame([{'Model' : clf_name
, 'F1_Score' : fscore
, 'Precision' : pres
, 'Recall' : rcall
, 'Accuracy' : accu
, 'ROC_AUC' : roc_auc}])]
, ignore_index = True) # DataFrame.append is deprecated
return pipelines, scores_df
modelPipeline(X_train, X_test, y_train, y_test)
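# Sketch: capture the returned pipelines and scores, then rank the models.
pipelines, scores_df = modelPipeline(X_train, X_test, y_train, y_test)
print(scores_df.sort_values('F1_Score', ascending = False))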


@ -0,0 +1,207 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 4 14:54:30 2022
@author: tanu
"""
import os, sys
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/")
# my function
from MultClassPipe import MultClassPipeline
gene = 'pncA'
drug = 'pyrazinamide'
#==============
# directories
#==============
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
#=======
# input
#=======
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
my_df = pd.read_csv(infile_ml1)
my_df.dtypes
my_df_cols = my_df.columns
geneL_basic = ['pnca']
geneL_na = ['gid']
geneL_na_ppi2 = ['rpob']
geneL_ppi2 = ['alr', 'embb', 'katg']
#%% get cols
mycols = my_df.columns
#%%============================================================================
# GET Y
# Target1: mutation_info_labels
dm_om_map = {'DM': 1, 'OM': 0}
target1 = my_df['mutation_info_labels'].map(dm_om_map)
# Target2: drug
drug_labels = drug + '_labels'
drug_labels
my_df[drug_labels] = my_df[drug].map({1: 'resistant', 0: 'sensitive'})
my_df[drug_labels].value_counts()
my_df[drug_labels] = my_df[drug_labels].fillna('unknown')
my_df[drug_labels].value_counts()
target2 = my_df[drug_labels]
# Target3: drtype
drtype_labels = 'drtype_labels'
my_df[drtype_labels] = my_df['drtype'].map({'Sensitive' : 0
, 'Other' : 0
, 'Pre-MDR' : 1
, 'MDR' : 1
, 'Pre-XDR' : 1
, 'XDR' : 1})
# target3 = my_df['drtype']
target3 = my_df[drtype_labels]
# target4
drtype_labels2 = 'drtype_labels2'
my_df[drtype_labels2] = my_df['drtype'].map({'Sensitive' : 0
, 'Other' : 0
, 'Pre-MDR' : 1
, 'MDR' : 1
, 'Pre-XDR' : 2
, 'XDR' : 2})
target4 = my_df[drtype_labels2]
# sanity checks
target1.value_counts()
my_df['mutation_info_labels'].value_counts()
target2.value_counts()
my_df[drug_labels].value_counts()
target3.value_counts()
my_df['drtype'].value_counts()
target4.value_counts()
my_df['drtype'].value_counts()
#%%
# GET X
common_cols_stabilty = ['ligand_distance'
, 'ligand_affinity_change'
, 'duet_stability_change'
, 'ddg_foldx'
, 'deepddg'
, 'ddg_dynamut2']
# Build stability columns ~ gene
if gene.lower() in geneL_basic:
x_stability_cols = common_cols_stabilty
if gene.lower() in geneL_ppi2:
x_stability_cols = common_cols_stabilty + ['mcsm_ppi2_affinity'
, 'interface_dist']
if gene.lower() in geneL_na:
x_stability_cols = common_cols_stabilty + ['mcsm_na_affinity']
if gene.lower() in geneL_na_ppi2:
x_stability_cols = common_cols_stabilty + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
# D1148: drop rows where mcsm_na_affinity is missing (column present only for NA-binding genes)
na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
my_df = my_df.drop(index=na_index)
X_strF = ['asa'
, 'rsa'
, 'kd_values'
, 'rd_values']
X_evolF = ['consurf_score'
, 'snap2_score'
, 'snap2_accuracy_pc']
# TODO: ADD ED values
# Problematic due to NA
# X_genomicF = ['af'
# , 'or_mychisq'
# , 'or_logistic'
# , 'or_fisher'
# , 'pval_fisher']
#%% try combinations
X_vars1 = my_df[x_stability_cols]
X_vars2 = my_df[X_strF]
X_vars3 = my_df[X_evolF]
#X_vars4 = my_df[X_genomicF]
#X_vars4 = X_vars4.fillna('unknown') # need one hot encoder!
X_vars5 = my_df[x_stability_cols + X_strF]
X_vars6 = my_df[x_stability_cols + X_evolF]
#X_vars7 = my_df[x_stability_cols + X_genomicF]
X_vars8 = my_df[X_strF + X_evolF]
#X_vars9 = my_df[X_strF + X_genomicF]
#X_vars10 = my_df[X_evolF + X_genomicF]
X_vars11 = my_df[x_stability_cols + X_strF + X_evolF]
#X_vars12 = my_df[x_stability_cols + X_strF + X_evolF + X_genomicF]
numerical_features_names = x_stability_cols + X_strF + X_evolF
# separate ones for foldx?
categorical_features_names = ['ss_class'
, 'wt_prop_water'
# , 'lineage_labels' # misleading if using merged_df3
, 'mut_prop_water'
, 'wt_prop_polarity'
, 'mut_prop_polarity'
, 'wt_calcprop'
, 'mut_calcprop'
, 'active_aa_pos']
numerical_features_df = my_df[numerical_features_names]
numerical_features_df.shape
categorical_features_df = my_df[categorical_features_names]
categorical_features_df.shape
all_features_df = my_df[numerical_features_names + categorical_features_names]
all_features_df.shape
#%%
X_vars1.shape[1]
X_vars5.shape[1]
# TODO: stratified cross validate
# Train-test Split
# TARGET1
X_train, X_test, y_train, y_test = train_test_split(X_vars1,
target1,
test_size = 0.33,
random_state = 42)
t1_res = MultClassPipeline(X_train, X_test, y_train, y_test)
t1_res
# TARGET3
X_train3, X_test3, y_train3, y_test3 = train_test_split(X_vars5,
target3,
test_size = 0.33,
random_state = 42)
t3_res = MultClassPipeline(X_train3, X_test3, y_train3, y_test3)
t3_res
#%%
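# Sketch for the "stratified cross validate" TODO above: as a first step,
# train_test_split can stratify the single split on the target so both
# halves keep the class ratio (assuming target1 and X_vars1 as above).
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_vars1
, target1
, test_size = 0.33
, random_state = 42
, stratify = target1)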


@ -0,0 +1,207 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 5 12:57:32 2022
@author: tanu
"""
import os, sys
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
#############################
# trying feature selection
#############################
#%%
model= Pipeline(steps = [
('pre', MinMaxScaler()),
('reg', LogisticRegression(class_weight = 'balanced'))])
def precision(y_true,y_pred):
return precision_score(y_true,y_pred,pos_label = 1)
def recall(y_true,y_pred):
return recall_score(y_true, y_pred, pos_label = 1)
def f1(y_true,y_pred):
return f1_score(y_true, y_pred, pos_label = 1)
acc = make_scorer(accuracy_score)
prec = make_scorer(precision)
rec = make_scorer(recall)
f1_scorer = make_scorer(f1) # separate name so the f1() helper is not overwritten
output = cross_validate(model, X_train, y_train
, scoring = {'acc' : acc
,'prec': prec
,'rec' : rec
,'f1' : f1_scorer}
, cv = 10
, return_train_score = False)
pd.DataFrame(output).mean()
#%%
# classification_report: reports per-class precision/recall/F1, so it shows how every class scores
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1_score(y_test, y_pred)
roc_auc_score (y_test, y_pred) # on hard labels; probability scores are preferable
#roc_curve(y_test, y_pred)
classification_report(y_test, y_pred)
target_names = {1:'Resistant', 0:'Sensitive'}
print(classification_report(y_test
, y_pred
#, target_names=y_test.map(target_names)
))
#%%NOT SURE!
from itertools import combinations
def train(X):
return cross_validate(model, X, y_train
#, scoring = make_scorer(accuracy_score)
, scoring = {'acc' : acc
,'prec' : prec
,'rec' : rec
,'f1' : f1_scorer}
, cv = 10
, return_train_score = False)
#, return_estimator = True)['test_score']
scores = [train(X_train.loc[:,vars]) for vars in combinations(X_train.columns, len(X_train.columns))]
means = [pd.DataFrame(score).mean() for score in scores] # cross_validate returns a dict, so wrap it before averaging
means
#%%
# TO TRY
# https://rasbt.github.io/mlxtend/
# stackoverflow
# informative post
# https://datascience.stackexchange.com/questions/937/does-scikit-learn-have-a-forward-selection-stepwise-regression-algorithm
# https://datascience.stackexchange.com/questions/24405/how-to-do-stepwise-regression-using-sklearn/24447#24447
# https://stats.stackexchange.com/questions/204141/difference-between-selecting-features-based-on-f-regression-and-based-on-r2
# supported from scikit-learn 0.24:
# https://scikit-learn.org/stable/auto_examples/release_highlights/plot_release_highlights_0_24_0.html#new-sequentialfeatureselector-transformer
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html
# https://www.scikit-yb.org/en/latest/api/model_selection/rfecv.html
#GridSearchCV
#ParameterGrid
#RandomizedSearchCV
#https://medium.com/analytics-vidhya/hyper-parameter-tuning-gridsearchcv-vs-randomizedsearchcv-499862e3ca5
#%% RFE: Feature selection in classification
# others in example
# https://towardsdatascience.com/feature-selection-techniques-for-classification-and-python-tips-for-their-application-10c0ddd7918b
# https://towardsdatascience.com/feature-selection-using-python-for-classification-problem-b5f00a1c7028
#model_logistic = LogisticRegression(solver='lbfgs'
# , multi_class = 'multinomial'
# , max_iter = 1000)
model_logistic = LogisticRegression() # the multinomial variant above is kept as a commented alternative
sel_rfe_logistic = RFE(estimator = model_logistic
, n_features_to_select = 4
, step = 1)
X_train_rfe_logistic = sel_rfe_logistic.fit_transform(X_train, y_train)
print(sel_rfe_logistic.get_support())
print(sel_rfe_logistic.ranking_)
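# Sketch: translate the boolean support mask into column names (assumes
# X_train is still a DataFrame at this point).
selected_features = X_train.columns[sel_rfe_logistic.get_support()]
print('RFE-selected features:', list(selected_features))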
#%% RFECV
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
# pick one target and one feature set; later assignments/splits override earlier ones
target = target1
target = target3
target = target4
X_train, X_test, y_train, y_test = train_test_split(X_vars1,
target,
test_size = 0.33,
random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(X_vars2,
target,
test_size = 0.33,
random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(X_vars3,
target,
test_size = 0.33,
random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(X_vars5,
target,
test_size = 0.33,
random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(X_vars11,
target,
test_size = 0.33,
random_state = 42)
model_logistic2 = LogisticRegression()
sel_rfe_logistic = RFECV(estimator = model_logistic2
, cv = 10
, step = 1)
X_train_rfe_logistic = sel_rfe_logistic.fit_transform(X_train, y_train)
print(sel_rfe_logistic.get_support())
X_train.columns
print(sel_rfe_logistic.ranking_)
#%%
# TODO: imputation
# Find out the best way to impute values!
#from sklearn.impute import SimpleImputer
# https://towardsdatascience.com/whats-the-best-way-to-handle-nan-values-62d50f738fc
#KNN and MICE
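# Sketch (assuming numeric columns only): scikit-learn ships a KNN imputer
# and an experimental iterative, MICE-like imputer.
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer # noqa: F401, enables the import below
from sklearn.impute import IterativeImputer
knn_imputer = KNNImputer(n_neighbors = 5)
mice_imputer = IterativeImputer(max_iter = 10, random_state = 42)
# e.g. my_df2[genomicF] = knn_imputer.fit_transform(my_df2[genomicF])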
my_df2 = pd.read_csv(infile_ml1)
genomicF = ['af'
, 'beta_logistic'
, 'or_logistic'
, 'pval_logistic'
, 'se_logistic'
, 'zval_logistic'
, 'ci_low_logistic'
, 'ci_hi_logistic'
, 'or_mychisq'
, 'log10_or_mychisq'
, 'or_fisher'
, 'pval_fisher'
, 'neglog_pval_fisher'
, 'ci_low_fisher'
, 'ci_hi_fisher'
, 'est_chisq'
, 'pval_chisq']
# X_genomicF = ['af'
# , 'or_mychisq'
# , 'or_logistic'
# , 'or_fisher'
# , 'pval_fisher']
my_df2[genomicF].isna().sum()
my_df2[genomicF] = my_df2[genomicF].fillna(value='unknown') # NOTE: turns numeric columns into object dtype; only sensible if treated as categorical afterwards


@ -0,0 +1,118 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 5 12:57:32 2022
@author: tanu
"""
#%%
# data, etc for now comes from my_data6.py and/or my_data5.py
#%% try combinations
#import sys, os
#os.system("imports.py")
#%%
seed = 42
features_to_encode = list(X_train.select_dtypes(include = ['object']).columns)
col_trans = make_column_transformer(
(OneHotEncoder(),features_to_encode),
remainder = "passthrough"
)
rf_classifier = RandomForestClassifier(
min_samples_leaf=50,
n_estimators=150,
bootstrap=True,
oob_score=True,
n_jobs=-1,
random_state=seed,
max_features='sqrt') # 'auto' is deprecated for classifiers and equals 'sqrt'
pipe = make_pipeline(col_trans, rf_classifier)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
#%%
all_features_df.shape
X_train, X_test, y_train, y_test = train_test_split(all_features_df,
target1,
test_size = 0.33,
random_state = 42)
preprocessor = ColumnTransformer(
transformers=[
('num', MinMaxScaler() , numerical_features_names)
,('cat', OneHotEncoder(), categorical_features_names)]) # ColumnTransformer takes column names, not the DataFrames themselves
seed = 42
rf_classifier = RandomForestClassifier(
min_samples_leaf=50,
n_estimators=150,
bootstrap=True,
oob_score=True,
n_jobs=-1,
random_state=seed,
max_features='sqrt')
preprocessor.fit(all_features_df)
preprocessor.transform(all_features_df)
model = Pipeline(steps = [
('preprocess', preprocessor)
,('regression',linear_model.LogisticRegression())
])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred
def precision(y_true,y_pred):
return precision_score(y_true,y_pred,pos_label = 1)
def recall(y_true,y_pred):
return recall_score(y_true, y_pred, pos_label = 1)
def f1(y_true,y_pred):
return f1_score(y_true, y_pred, pos_label = 1)
acc = make_scorer(accuracy_score)
prec = make_scorer(precision)
rec = make_scorer(recall)
f1_scorer = make_scorer(f1) # separate name so the f1() helper is not overwritten
output = cross_validate(model, X_train, y_train
, scoring = {'acc' : acc
,'prec': prec
,'rec' : rec
,'f1' : f1_scorer}
, cv = 10
, return_train_score = False)
pd.DataFrame(output).mean()
#%% with feature selection
preprocessor.fit(numerical_features_df)
preprocessor.transform(numerical_features_df)
model = Pipeline(steps = [
('preprocess', preprocessor)
,('regression',linear_model.LogisticRegression())
])
selector_logistic = RFECV(estimator = model
, cv = 10
, step = 1
, importance_getter = 'named_steps.regression.coef_') # a Pipeline exposes no coef_ itself, so point RFECV at the final step
X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df
, target1
, test_size = 0.33
, random_state = 42)
selector_logistic_xtrain = selector_logistic.fit_transform(X_trainN, y_trainN)
print(selector_logistic.get_support())
X_trainN.columns
print(selector_logistic.ranking_)


@ -0,0 +1,144 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 5 12:57:32 2022
@author: tanu
"""
#%%
# data, etc for now comes from my_data6.py and/or my_data5.py
#%% try combinations
#import sys, os
#os.system("imports.py")
def precision(y_true,y_pred):
return precision_score(y_true,y_pred,pos_label = 1)
def recall(y_true,y_pred):
return recall_score(y_true, y_pred, pos_label = 1)
def f1(y_true,y_pred):
return f1_score(y_true, y_pred, pos_label = 1)
#%%
numerical_features_df.shape
categorical_features_df.shape
all_features_df.shape
#%%
target = target1
#target = target3
X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df,
target,
test_size = 0.33,
random_state = 42)
X_trainC, X_testC, y_trainC, y_testC = train_test_split(categorical_features_df,
target,
test_size = 0.33,
random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(all_features_df,
target,
test_size = 0.33,
random_state = 42)
#%%
#%%
preprocessor = ColumnTransformer(
transformers=[
('num', MinMaxScaler() , numerical_features_names)
,('cat', OneHotEncoder(), categorical_features_names)
], remainder = 'passthrough')
f = preprocessor.fit(numerical_features_df)
f2 = preprocessor.transform(numerical_features_df)
f3 = preprocessor.fit_transform(numerical_features_df)
(f3==f2).all()
preprocessor.fit_transform(numerical_features_df)
#preprocessor.fit_transform(all_features_df)
#%%
model_log = Pipeline(steps = [
('preprocess', preprocessor)
#,('log_reg', linear_model.LogisticRegression())
,('log_reg', LogisticRegression(
class_weight = 'balanced'))
])
model = model_log
#%%
seed = 42
model_rf = Pipeline(steps = [
('preprocess', preprocessor)
,('rf', RandomForestClassifier(
min_samples_leaf=50,
n_estimators=150,
bootstrap=True,
oob_score=True,
n_jobs=-1,
random_state=seed,
max_features='sqrt')) # 'auto' is deprecated for classifiers and equals 'sqrt'
])
model = model_rf
#%%
model.fit(X_trainN, y_trainN)
y_pred = model.predict(X_testN)
y_pred
acc = make_scorer(accuracy_score)
prec = make_scorer(precision)
rec = make_scorer(recall)
f1_scorer = make_scorer(f1) # separate name so the f1() helper is not overwritten
output = cross_validate(model, X_trainN, y_trainN
, scoring = {'acc' : acc
,'prec': prec
,'rec' : rec
,'f1' : f1_scorer}
, cv = 10
, return_train_score = False)
pd.DataFrame(output).mean()
#%% Run multiple models using MultClassPipeline
# only suitable for numerical features; categorical features are not supported yet
t1_res = MultClassPipeline2(X_trainN, X_testN, y_trainN, y_testN, input_df = all_features_df)
t1_res
#%%
# https://machinelearningmastery.com/columntransformer-for-numerical-and-categorical-data/
#Each transformer is a three-element tuple that defines the name of the transformer, the transform to apply, and the column indices to apply it to. For example:
# (Name, Object, Columns)
# Determine categorical and numerical features
numerical_ix = all_features_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = all_features_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix
# Define the data preparation for the columns
t = [('cat', OneHotEncoder(), categorical_ix)
, ('num', MinMaxScaler(), numerical_ix)]
col_transform = ColumnTransformer(transformers=t
, remainder='passthrough')
# create pipeline (unlike example above where the col transfer was a preprocess step and it was fit_transformed)
pipeline = Pipeline(steps=[('prep', col_transform)
, ('classifier', LogisticRegression())])
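# Sketch: the mixed-type pipeline can be scored directly with cross-validation;
# the ColumnTransformer is then re-fitted inside every fold (assumes
# all_features_df and target as above).
from sklearn.model_selection import cross_val_score
cv_f1 = cross_val_score(pipeline, all_features_df, target, cv = 10, scoring = 'f1')
print('Mean F1:', cv_f1.mean())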
#%% Added this to the MultClassPipeline
tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN)
tN_res
t2_res = MultClassPipeline2(X_train, X_test, y_train, y_test, input_df = all_features_df)
t2_res
t3_res = MultClassPipeSKF(input_df = numerical_features_df
, y_targetF = target1
, var_type = 'numerical'
, skf_splits = 10)
t3_res
t4_res = MultClassPipeSKF(input_df = all_features_df
, y_targetF = target1
, var_type = 'mixed'
, skf_splits = 10)
t4_res

earlier_versions/p_jr_d1.py Normal file

@ -0,0 +1,405 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 17 14:52:55 2022
@author: tanu
"""
from sklearn.datasets import load_boston
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
boston = load_boston() # NOTE: load_boston is deprecated and was removed in scikit-learn 1.2
dir(boston)
#['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'target']
X, y = boston.data, boston.target
df = pd.DataFrame(X, columns = boston.feature_names)
df['MEDV'] = y
sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
plt.show()
#Model fitting
#To fit a model using just a single predictor we first extract the training variables.
X_train = df['CRIM']
y_train = y
# Unfortunately, sklearn's various model fitting functions typically expect a
# two-dimensional array for the covariates. Since we have extracted only
# a single feature here, it is one dimensional. We need to reshape the
# X_train values to the appropriate shape.
# This is not necessary if using more than a single feature.
if len(X_train.values.shape) == 1:
X_train = X_train.values.reshape(-1, 1)
# Create a LinearRegression object: this object is of a broader class of
# estimator objects.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
# We can make predictions from our fitted model with the .predict() method.
new_value = np.array(4.09, ndmin = 2)
model.predict(new_value)
multiple_values = np.array([1, 2, 3], ndmin = 2).T
model.predict(multiple_values)
#Fitted values
#The fitted values of a model are the predicted ŷ for the observations X.
#To get the model fitted values we could just predict from the model using
#the values used to train it.
fitted = model.predict(X_train)
ax = sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
sns.lineplot(x = df['CRIM'], y = fitted, ax = ax)
plt.show()
# Interpreting the coefficients
# The coefficients of the fitted model are kept in the model.coef_ attribute.
# This gives us the expected change in y for a unit change in X .
model.coef_
#2.3 Multiple linear regression
X_train = df.iloc[:,:3]
grid = sns.PairGrid(data=pd.concat([X_train,pd.Series(y_train,name="MEDV")],axis = 1))
grid.map_offdiag(sns.scatterplot)
grid.map_diag(sns.histplot) # distplot is deprecated in newer seaborn
plt.show()
model.fit(X_train, y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)
#Residuals
#In classical statistics, one of our assumptions is that the residuals
#are normally distributed. A small RSS implies the fitted model is
#closer to the observations.
fitted = model.predict(X_train)
resid = y_train - fitted
# Standardise to remove effect of measurement scale
resid = (resid - np.mean(resid))/np.std(resid,ddof = 1)
plt.figure()
for i in range(3):
xvar = X_train.iloc[:,i]
ax = plt.subplot(3, 1, i + 1)
ax.scatter(xvar, resid)
ax.set_xlabel(boston.feature_names[i])
ax.set_ylabel("Residuals")
ax.hlines([-2, 0, 2], np.min(xvar), np.max(xvar))
plt.show()
plt.figure()
ax = plt.subplot(3, 1, 1)
ax.scatter(fitted,resid)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')
ax = plt.subplot(3,1,2)
ax.scatter(fitted,y_train)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Predicted values')
ax = plt.subplot(3, 1,3)
import scipy.stats as stats
stats.probplot(resid,dist = 'norm',plot = ax)
plt.show()
#Scaling data: many types available
# sklearn comes with many preprocessing transformations in the sklearn.preprocessing module
#Scaling is crucial for many statistical and machine learning algorithms
# • k-means and hierarchical clustering
# Data units & variance play crucial role in cluster selection
# • Using gradient descent optimization
# Scaled data allows the weights to update at an equal speed
# • Scaled data allows the regression coefficients to be compared
#########################################################
# Min-max scaling
# DOESN'T change the shape
# DOES change the bounds, mean and sd
# NOT often used in LR
# used more in GDO (gradient Descent Optimisation)
# sklearn.preprocessing module has a MinMaxScaler() for this
##########################################################
np.random.seed(1)
x_n = np.random.normal(2, 5, 500)
x_t = np.random.standard_t(2, 500)
x_ln = np.random.lognormal(1, 1, 500)
df = pd.DataFrame({ 'Normal': x_n, 'T': x_t, 'Lognormal': x_ln
})
df_long = df.melt(var_name='Distribution')
g = sns.FacetGrid(df_long, col='Distribution',sharex=False)
g.map(plt.hist, 'value', bins = 50)
plt.show()
def min_max(x):
x_min = np.min(x) # avoid shadowing the built-in min()
s = (x - x_min)/(np.max(x) - x_min)
return (s)
scaled = df.apply(min_max).melt(var_name='Distribution')
scaled['Scaled'] = True
df_long['Scaled'] = False
full_data = pd.concat([df_long, scaled], axis=0)
g = sns.FacetGrid(full_data, col='Distribution'
,row='Scaled'
, sharex=False
, sharey=False)
g.map(plt.hist, 'value', bins = 50)
plt.show()
df.apply([np.mean,np.std])
df.apply(min_max).apply([np.mean,np.std])
# sklearn: MinMaxScaler()
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled[:1]
##########################################################
# z-score standardisation
# DOESN'T change the shape
# popular in linear models
# DOESN'T effect the predictions
# but makes the size of the coeffs directly comparable
# sklearn.preprocessing module has a StandardScaler() for this
##########################################################
def z_score(x):
mean = np.mean(x)
std = np.std(x, ddof=1)
return (x - mean)/std
scaled = df.apply(z_score).melt(var_name='Distribution')
scaled['Scaled'] = True
full_data = pd.concat([df_long, scaled], axis=0)
g = sns.FacetGrid(full_data, col='Distribution'
, row ='Scaled'
, sharex=False
,sharey=False)
g.map(plt.hist, 'value', bins=50)
###############################################
# Dividing by two standard deviations
# http://www.stat.columbia.edu/~gelman/research/published/standardizing7.pdf
# One of the downsides of scaling data by z-scoring is that it is not obvious
# how this should be handled in the case of categorical variables.
# Gelman suggests the use of a rescaling that divides numeric variables
# by two standard deviations, whilst leaving binary encoded categorical
# variables untransformed.
# nothing in sklearn for this
###############################################
from sklearn.base import BaseEstimator, TransformerMixin
class two_sd_scaler(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
self.stds = 2*np.std(X, axis=0, ddof=1)
return self
def transform(self, X, y=None):
return X/self.stds
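# Sketch: the custom transformer drops into the usual fit/transform workflow
# (TransformerMixin supplies fit_transform for free).
scaler_2sd = two_sd_scaler()
X_train_2sd = scaler_2sd.fit_transform(X_train)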
# Having preprocessed the data this way we can now fit a model to it in the
# same way as before.
model2 = linear_model.LinearRegression()
model2.fit(X_train_scaled, y_train)
#When making predictions on new values we also need to make sure to pass
#the new values through the same preprocessing step.
new_value = np.array(X_train.mean(), ndmin = 2)
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred
##########################
# 2.5 Creating a pipeline
##########################
# For any training data set and any data for prediction we will want to apply
# the same scaling transformation and use the same model. We could create
# a sklearn.pipeline.Pipeline() to organise the steps of creating the
# estimator.
from sklearn.pipeline import Pipeline
model = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
,('regression', linear_model.LinearRegression())
])
# Having created the Pipeline object we can now fit as before. Calling
# .fit() now however, will first fit the 'preprocess' step and then the
# 'regression' step. When we predict, the new values will also pass through
# both stages of our pipeline.
model.fit(X_train,y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)
#from sklearn.metrics import accuracy_score
#print(accuracy_score(y_test, model.predict(X_test)))
#2.6 Preprocessing categorical variables
# One hot encoding: will take a categorical feature with K categories and
# create a one of K encoding scheme. I.e a set of binary variables for each
# category. Consider the toy data
toy = pd.DataFrame({
'category':['a', 'a', 'b', 'c', 'b']
})
enc = preprocessing.OneHotEncoder()
enc.fit(toy)
enc.transform(toy).toarray()
#Combining preprocessing steps:
# we can combine the preprocessing steps into a single operation
# for our Pipeline using a sklearn.compose.ColumnTransformer
toy = pd.DataFrame({
'numeric': [1., 2., 3., 4., 5.],
'category': ['a', 'a', 'b', 'c', 'b']
})
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
numeric_features = ['numeric']
categorical_features = ['category']
preprocessor = ColumnTransformer(transformers=[('num', StandardScaler()
, numeric_features)
,('cat', OneHotEncoder(), categorical_features)])
# Model Assessment and Feature Selection
#%%#####################################################################
# Accuracy score is only for classification problems.
# For regression problems use: R2 score, MSE (mean squared error) or
# RMSE (root mean squared error), as sketched below.
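# A hedged sketch of those regression metrics (assumes a fitted regression
# model `reg` and a matching test split; these names are illustrative only):
# from sklearn.metrics import r2_score, mean_squared_error
# import numpy as np
# y_pred = reg.predict(X_test)
# print(r2_score(y_test, y_pred))                     # R2
# print(mean_squared_error(y_test, y_pred))           # MSE
# print(np.sqrt(mean_squared_error(y_test, y_pred)))  # RMSE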
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import accuracy_score
import numpy as np
# read data
iris = datasets.load_iris()
# assign X and y
X = iris.data
y = iris.target
# split data into train and test sets (25% of the data is held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
# preprocess the data
# scaling
scaler = preprocessing.MinMaxScaler()
# fit the scaler to X_train
scaler.fit(X_train)
# apply the scaling/transformation to the data
X_train_scaled = scaler.transform(X_train)
# Choose the required model(s)
# model2 = linear_model.LinearRegression() # would fail below: classification
# metrics can't handle a mix of multiclass and continuous targets
model2 = DecisionTreeClassifier()
# fit the model to the data for predictions
model2.fit(X_train_scaled, y_train)
# check model performance (X_test must pass through the same scaler)
print(accuracy_score(y_test, model2.predict(scaler.transform(X_test))))
#When making predictions on new values we also need to make sure to pass
#the new values through the same preprocessing step.
new_value = np.array(X_train.mean(), ndmin = 2)
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred
# or Create a pipeline that standardizes the data then creates a model
# make a pipeline
# PCA(Dimension reduction to two) -> Scaling the data -> DecisionTreeClassification
#https://www.geeksforgeeks.org/pipelines-python-and-scikit-learn/
pipe1 = Pipeline([('pca', PCA(n_components = 2))
, ('std', StandardScaler())
, ('decision_tree', DecisionTreeClassifier())]
, verbose = True)
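# Note: pipe1 runs PCA before scaling; scaling first is the more common
# convention, since PCA is sensitive to feature variance. A hedged alternative:
pipe1b = Pipeline([('std', StandardScaler())
                   , ('pca', PCA(n_components = 2))
                   , ('decision_tree', DecisionTreeClassifier())])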
pipe2 = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
#,('regression', linear_model.LinearRegression())
,('rf', RandomForestClassifier())
])
# fit pipeline to TRAINING data [X_train and y_train]
pipe1.fit(X_train, y_train)
pipe2.fit(X_train, y_train)
# model prediction on TEST data [X_test and y_test]
print(accuracy_score(y_test, pipe1.predict(X_test)))
print(accuracy_score(y_test, pipe2.predict(X_test)))
# classification_report is a sklearn.metrics function, not a Pipeline method
from sklearn.metrics import classification_report
print(classification_report(y_test, pipe2.predict(X_test)))
# NB: one-hot encoding the continuous iris features below is illustrative only
enc = preprocessing.OneHotEncoder()
enc.fit(X_train)
enc.transform(X_train).toarray()
#%%
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import load_boston
import pandas as pd
boston = load_boston()
X_train, y_train = pd.DataFrame(boston.data, columns = boston.feature_names), boston.target
model1 = Pipeline(steps = [
('pre', MinMaxScaler()),
('reg', LinearRegression())])
score_fn = make_scorer(mean_squared_error)
scores = cross_validate(model1, X_train, y_train
, scoring = score_fn
, cv = 10)
from itertools import combinations
def train(X):
    return cross_validate(model1, X, y_train
                          , scoring = score_fn
                          #, return_train_score = False
                          , return_estimator = True)['test_score']
# exhaustively score every 12-of-13 feature subset (13 fits in total)
scores = [train(X_train.loc[:,vars]) for vars in combinations(X_train.columns, 12)]
means = [score.mean() for score in scores]
means
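# Hedged follow-up: the scorer is MSE, so the best 12-feature subset is the
# one with the smallest mean score. `combos` re-materialises the same
# combinations for lookup (the argmin index matches the scores/means order).
import numpy as np
combos = list(combinations(X_train.columns, 12))
best_subset = combos[int(np.argmin(means))]
best_subset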

View file

@ -0,0 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 23 11:13:45 2022
@author: tanu
"""

View file

@ -0,0 +1,99 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 7 15:20:42 2022
@author: tanu
"""
fit_time 0.008588
score_time 0.004460
test_acc 0.690148
test_prec 0.690868
test_rec 0.771250
test_f1 0.725441
# RF
fit_time 0.368793
score_time 0.110153
test_acc 0.672537
test_prec 0.664875
test_rec 0.790417
test_f1 0.720224
dtype: float64
#%%
numerical_features: ['ligand_distance', 'ligand_affinity_change'
, 'duet_stability_change', 'ddg_foldx', 'deepddg', 'ddg_dynamut2'
, 'asa', 'rsa', 'kd_values', 'rd_values'
, 'consurf_score', 'snap2_score', 'snap2_accuracy_pc']
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.734177 0.690476 0.783784 0.700000 0.694922
1 Naive Bayes 0.467290 0.757576 0.337838 0.592857 0.608313
2 K-Nearest Neighbors 0.773006 0.707865 0.851351 0.735714 0.728706
3 SVM 0.766467 0.688172 0.864865 0.721429 0.712735
4 MLP 0.725000 0.674419 0.783784 0.685714 0.679771
5 Decision Tree 0.662069 0.676056 0.648649 0.650000 0.650082
6 Extra Trees 0.748387 0.716049 0.783784 0.721429 0.717649
7 Random Forest 0.722581 0.691358 0.756757 0.692857 0.688984
8 Random Forest2 0.731707 0.666667 0.810811 0.685714 0.678133
9 XGBoost 0.692810 0.670886 0.716216 0.664286 0.661138
all_features: numerical_features + ['ss_class', 'wt_prop_water', 'mut_prop_water', 'wt_prop_polarity',
'mut_prop_polarity', 'wt_calcprop', 'mut_calcprop', 'active_aa_pos']
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.757764 0.701149 0.824324 0.721429 0.715192
1 Naive Bayes 0.620690 0.633803 0.608108 0.607143 0.607084
2 K-Nearest Neighbors 0.619355 0.592593 0.648649 0.578571 0.574324
3 SVM 0.766467 0.688172 0.864865 0.721429 0.712735
4 MLP 0.738854 0.698795 0.783784 0.707143 0.702498
5 Decision Tree 0.666667 0.701493 0.635135 0.664286 0.666052
6 Extra Trees 0.728395 0.670455 0.797297 0.685714 0.678952
7 Random Forest 0.763636 0.692308 0.851351 0.721429 0.713554
8 Random Forest2 0.746988 0.673913 0.837838 0.700000 0.691646
9 XGBoost 0.710526 0.692308 0.729730 0.685714 0.683047
#%%
Model F1_Score Precision Recall Accuracy ROC_AUC
0Num Logistic Regression 0.734177 0.690476 0.783784 0.700000 0.694922
0All Logistic Regression 0.757764 0.701149 0.824324 0.721429 0.715192
1Num Naive Bayes 0.467290 0.757576 0.337838 0.592857 0.608313
1All Naive Bayes 0.620690 0.633803 0.608108 0.607143 0.607084
2Num K-Nearest Neighbors 0.773006 0.707865 0.851351 0.735714 0.728706 ** 'Num' is better than 'All'
2All K-Nearest Neighbors 0.619355 0.592593 0.648649 0.578571 0.574324
3Num SVM 0.766467 0.688172 0.864865 0.721429 0.712735
3All SVM 0.766467 0.688172 0.864865 0.721429 0.712735
4Num MLP 0.725000 0.674419 0.783784 0.685714 0.679771
4All MLP 0.738854 0.698795 0.783784 0.707143 0.702498
5Num Decision Tree 0.662069 0.676056 0.648649 0.650000 0.650082 ** marginal, equivalent
5All Decision Tree 0.666667 0.701493 0.635135 0.664286 0.666052
6Num Extra Trees 0.748387 0.716049 0.783784 0.721429 0.717649 ** marginal, equivalent
6All Extra Trees 0.728395 0.670455 0.797297 0.685714 0.678952
7Num Random Forest 0.722581 0.691358 0.756757 0.692857 0.688984
7All Random Forest 0.763636 0.692308 0.851351 0.721429 0.713554
8Num Random Forest2 0.731707 0.666667 0.810811 0.685714 0.678133
8All Random Forest2 0.746988 0.673913 0.837838 0.700000 0.691646
9Num XGBoost 0.692810 0.670886 0.716216 0.664286 0.661138
9All XGBoost 0.710526 0.692308 0.729730 0.685714 0.683047
#%%
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.757764 0.701149 0.824324 0.721429 0.715192
1 Naive Bayes 0.628571 0.666667 0.594595 0.628571 0.630631
2 K-Nearest Neighbors 0.666667 0.623529 0.716216 0.621429 0.615684
3 SVM 0.766467 0.688172 0.864865 0.721429 0.712735
4 MLP 0.726115 0.686747 0.770270 0.692857 0.688165
5 Decision Tree 0.647482 0.692308 0.608108 0.650000 0.652539
6 Extra Trees 0.760736 0.696629 0.837838 0.721429 0.714373
7 Random Forest 0.736196 0.674157 0.810811 0.692857 0.685708
8 Random Forest2 0.736196 0.674157 0.810811 0.692857 0.685708
9 XGBoost 0.710526 0.692308 0.729730 0.685714 0.683047

View file

@ -0,0 +1,69 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 21 13:06:25 2022
@author: tanu
"""
# imports assumed from the session; listed so this scratch file stands alone
import pandas as pd
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

X_train
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
x_train_scaled = scaler.transform(X_train)
x_train_scaled
foo = scaler.fit(X_train)
x_train_scaled2 = foo.transform(X_train)
x_train_scaled2
(x_train_scaled == x_train_scaled2).all()
toy = pd.DataFrame({
'numeric': [1., 2., 3., 4., 5.],
'category': ['a', 'a', 'b', 'c', 'b']
})
numeric_features = ['numeric']
categorical_features = ['category']
preprocessor = ColumnTransformer(transformers=[('num', StandardScaler(), numeric_features),
('cat', OneHotEncoder(), categorical_features)
])
preprocessor.fit(toy)
bar = preprocessor.transform(toy)
bar
#############
toy2 = pd.DataFrame({
'numeric': [1., 2., 3., 4., 5.],
'numeric2': [1., 2., 3., 4., 6.],
'category': ['a', 'a', 'b', 'c', 'b'],
'category2': ['b', 'a', 'b', 'e', 'f']
})
numeric_features = ['numeric', 'numeric2']
categorical_features = ['category', 'category2']
preprocessor = ColumnTransformer(transformers=[
('num', StandardScaler(), numeric_features),
('cat', OneHotEncoder(), categorical_features)
])
preprocessor.fit(toy2)
bar2 = preprocessor.transform(toy2)
bar2
####
import numpy as np
from sklearn.decomposition import PCA
from pandas import DataFrame
pca = PCA(n_components = 2)
pca.fit(toy2.iloc[:, 0:2])
columns = ['pca_%i' % i for i in range(2)]
df_pca = DataFrame(pca.transform(toy2.iloc[:, 0:2])
, columns=columns
, index=toy2.index)
df_pca.head()

161
earlier_versions/skf_mm.py Normal file
View file

@ -0,0 +1,161 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 10:33:15 2022
@author: tanu
"""
#%% Stratified KFold: Multiple_models:
# imports assumed from the session; repeated here so the script stands alone
import pandas as pd
from statistics import mean
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (f1_score, matthews_corrcoef, precision_score,
                             recall_score, accuracy_score, roc_auc_score)
# Choose ONE input_df/var_type pair before running (the last assignment wins):
# input_df = numerical_features_df ; var_type = 'numerical'
# input_df = all_features_df ; var_type = 'mixed'
input_df = categorical_features_df
var_type = 'categorical'
targetF = target1
#==============================================================================
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix
# Determine preprocessing steps ~ var_type
if var_type == 'numerical':
t = [('num', MinMaxScaler(), numerical_ix)]
if var_type == 'categorical':
t = [('cat', OneHotEncoder(), categorical_ix)]
if var_type == 'mixed':
t = [('cat', OneHotEncoder(), categorical_ix)
, ('num', MinMaxScaler(), numerical_ix)]
###############################################################################
col_transform = ColumnTransformer(transformers = t
, remainder='passthrough')
###############################################################################
rs = {'random_state': 42}
#log_reg = LogisticRegression(**rs)
log_reg = LogisticRegression(class_weight = 'balanced')
nb = BernoulliNB()
rf = RandomForestClassifier(**rs)
clfs = [('Logistic Regression', log_reg)
,('Naive Bayes' , nb)
, ('Random Forest' , rf)
]
#seed_skf = 42
skf = StratifiedKFold(n_splits = 10
, shuffle = True
#, random_state = seed_skf
, **rs)
#scores_df = pd.DataFrame()
# NOTE: these lists are shared across folds AND classifiers, so the running
# means below blend every model seen so far (see the CHECK comments at the end)
fscoreL = []
mccL = []
presL = []
recallL = []
accuL = []
roc_aucL = []
# X_array = np.array(input_df)
# Y = np.array(target1)
# Y = target1
for train_index, test_index in skf.split(input_df, targetF):
print('\nSKF train index:', train_index
, '\nSKF test index:', test_index)
x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
y_train_fold, y_test_fold = targetF.iloc[train_index], targetF.iloc[test_index]
# for train_index, test_index in skf.split(X_array, Y):
# print('\nSKF train index:', train_index
# , '\nSKF test index:', test_index)
# x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
# y_train_fold, y_test_fold = Y[train_index], Y[test_index]
clf_scores_df = pd.DataFrame()
for clf_name, clf in clfs:
# model2 = Pipeline(steps=[('preprocess', MinMaxScaler())
# , ('classifier', clf)])
model2 = Pipeline(steps=[('preprocess', col_transform)
, ('classifier', clf)])
model2.fit(x_train_fold, y_train_fold)
y_pred_fold = model2.predict(x_test_fold)
#----------------
# Model metrics
#----------------
# F1-Score
fscore = f1_score(y_test_fold, y_pred_fold)
fscoreL.append(fscore)
# print('fscoreL Len: ', len(fscoreL))
fscoreM = mean(fscoreL)
# Matthews correlation coefficient
mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
mccL.append(mcc)
mccM = mean(mccL)
# Precision
pres = precision_score(y_test_fold, y_pred_fold)
presL.append(pres)
presM = mean(presL)
# Recall
recall = recall_score(y_test_fold, y_pred_fold)
recallL.append(recall)
recallM = mean(recallL)
# Accuracy
accu = accuracy_score(y_test_fold, y_pred_fold)
accuL.append(accu)
accuM = mean(accuL)
# ROC_AUC
roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
roc_aucL.append(roc_auc)
roc_aucM = mean(roc_aucL)
clf_scores_df = clf_scores_df.append({'Model': clf_name
,'F1_score' : fscoreM
, 'MCC' : mccM
, 'Precision': presM
, 'Recall' : recallM
, 'Accuracy' : accuM
, 'ROC_curve': roc_aucM}
, ignore_index = True)
#scores_df = scores_df.append(clf_scores_df)
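# Hedged fix sketch for the 'awfully close' numbers flagged below: because the
# metric lists above are shared across classifiers, each running mean blends
# every model seen so far. Keeping per-classifier lists separates them, e.g.:
# from collections import defaultdict
# per_clf = {clf_name: defaultdict(list) for clf_name, _ in clfs}
# # ...inside the classifier loop, append per model instead:
# # per_clf[clf_name]['f1'].append(f1_score(y_test_fold, y_pred_fold))
# # ...after all folds, average per model:
# # {name: {m: mean(v) for m, v in d.items()} for name, d in per_clf.items()}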
#%% Call functions
tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN)
tN_res
t2_res = MultClassPipeline2(X_train, X_test, y_train, y_test, input_df = all_features_df)
t2_res
#CHECK: numbers are awfully close to each other!
t3_res = MultClassPipeSKF(input_df = numerical_features_df
, y_targetF = target1
, var_type = 'numerical'
, skf_splits = 10)
t3_res
#CHECK: numbers are awfully close to each other!
t4_res = MultClassPipeSKF(input_df = all_features_df
, y_targetF = target1
, var_type = 'mixed'
, skf_splits = 10)
t4_res

View file

@ -0,0 +1,39 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 14 10:46:44 2022
@author: tanu
"""
# Link: https://laptrinhx.com/how-to-run-30-machine-learning-models-with-2-lines-of-code-1521663246/
import pyforest
import warnings
warnings.filterwarnings("ignore")
from sklearn import metrics
from sklearn.metrics import accuracy_score
import lazypredict
from lazypredict.Supervised import LazyClassifier
#%%
target = target1
#target = target3
X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df,
target,
test_size = 0.33,
random_state = 42)
#%%
clf = LazyClassifier(verbose=0,ignore_warnings=True)
modelsN, predictionsN = clf.fit(X_trainN, X_testN, y_trainN, y_testN)
mm_lpN = modelsN
#%%
# DOESN'T work: the mixed-type features need a pipeline (one-hot encoder) first
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
mm_lp = models
# NOTE: this also fails; Pipeline.fit() takes (X, y), not the 4-argument
# signature LazyClassifier uses, and LazyClassifier is not a valid final step
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
                           , ('multiModels', clf) ])
models, predictions = model1.fit(X_trainN, X_testN, y_trainN, y_testN)
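# Hedged workaround sketch: LazyClassifier has no Pipeline support, so apply
# the encoding up front and hand it plain arrays (ColumnTransformer /
# OneHotEncoder as used earlier; categorical_ix assumed from the session):
# ct = ColumnTransformer([('cat', OneHotEncoder(), categorical_ix)]
#                        , remainder = 'passthrough')
# X_train_enc = ct.fit_transform(X_train)
# X_test_enc = ct.transform(X_test)
# models, predictions = clf.fit(X_train_enc, X_test_enc, y_train, y_test)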