modified loopity and multclass3 to take skf_cv as a parameter for cv

This commit is contained in:
Tanushree Tunstall 2022-03-17 18:17:58 +00:00
parent 97620c1bb0
commit d0c329a1d9
8 changed files with 161 additions and 127 deletions
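In practical terms, the refactor moves fold construction out of the pipeline functions: instead of a hard-coded cv = 10 inside cross_validate, the caller now builds the splitter once and injects it. A minimal sketch of the new call pattern (X and y as assembled in the scripts below; names as in this commit):

    from sklearn.model_selection import StratifiedKFold

    rs = {'random_state': 42}
    # build the splitter once; shuffle and seed are now controlled in one place
    skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, **rs)

    # no pre-split X_train/X_test needed: cross_validate runs inside the function
    mm_skf_scoresD = MultClassPipeSKFCV(input_df = X
                                        , target = y
                                        , var_type = 'mixed'
                                        , skf_cv = skf_cv)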

View file

@@ -61,23 +61,39 @@ from imblearn.combine import SMOTEENN
 from imblearn.under_sampling import EditedNearestNeighbours
 #%%
-rs = {'random_state': 42}
-# Done: add preprocessing step with one hot encoder
-# Done: get accuracy and other scores through K-fold stratified cv
+# rs = {'random_state': 42}
+# njobs = {'n_jobs': 10}

 scoring_fn = ({ 'fscore'    : make_scorer(f1_score)
               , 'mcc'       : make_scorer(matthews_corrcoef)
               , 'precision' : make_scorer(precision_score)
               , 'recall'    : make_scorer(recall_score)
               , 'accuracy'  : make_scorer(accuracy_score)
               , 'roc_auc'   : make_scorer(roc_auc_score)
             #, 'jaccard'    : make_scorer(jaccard_score)
               })

 # Multiple Classification - Model Pipeline
-def MultClassPipelineCV(X_train, X_test, y_train, y_test, input_df, var_type = ['numerical', 'categorical', 'mixed']):
+def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categorical', 'mixed']):
+    '''
+    @param input_df: input features
+    @type: df with input features WITHOUT the target variable
+
+    @param target: target (or output) feature
+    @type: df or np.array or Series
+
+    @param skf_cv: stratified K-fold int or object, allowing shuffle and random state to be passed
+    @type: int or StratifiedKFold()
+
+    @param var_type: numerical, categorical or mixed, to determine which col_transform to apply (MinMaxScaler and/or one-hot encoder)
+    @type: list
+
+    returns
+    Dict containing multiple classification scores for each model and the mean of each stratified K-fold, including training
+    '''
     # determine categorical and numerical features
     numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
     numerical_ix
@@ -98,66 +114,61 @@ def MultClassPipelineCV(X_train, X_test, y_train, y_test, input_df, var_type = [
     col_transform = ColumnTransformer(transformers = t
                                       , remainder = 'passthrough')

-#%%
+    #%% Specify multiple Classification models
     log_reg = LogisticRegression(**rs)
     nb = BernoulliNB()
     knn = KNeighborsClassifier()
     svm = SVC(**rs)
-    mlp = MLPClassifier(max_iter=500, **rs)
+    mlp = MLPClassifier(max_iter = 500, **rs)
     dt = DecisionTreeClassifier(**rs)
     et = ExtraTreesClassifier(**rs)
     rf = RandomForestClassifier(**rs)
     rf2 = RandomForestClassifier(
-        min_samples_leaf=50,
-        n_estimators=150,
-        bootstrap=True,
-        oob_score=True,
-        n_jobs=-1,
-        random_state=42,
-        max_features='auto')
+        min_samples_leaf = 50
+        , n_estimators = 150
+        , bootstrap = True
+        , oob_score = True
+        , **njobs
+        , **rs
+        , max_features = 'auto')

-    xgb = XGBClassifier(**rs, verbosity=0)
+    xgb = XGBClassifier(**rs
+                        , verbosity = 0, use_label_encoder = False)

-    models = [
-        ('Logistic Regression', log_reg),
-        ('Naive Bayes', nb),
-        ('K-Nearest Neighbors', knn),
-        ('SVM', svm),
-        ('MLP', mlp),
-        ('Decision Tree', dt),
-        ('Extra Trees', et),
-        ('Random Forest', rf),
-        ('Random Forest2', rf2),
-        #('XGBoost', xgb)
-    ]
+    models = [('Logistic Regression', log_reg)
+              , ('Naive Bayes'        , nb)
+              , ('K-Nearest Neighbors', knn)
+              , ('SVM'                , svm)
+              , ('MLP'                , mlp)
+              , ('Decision Tree'      , dt)
+              , ('Extra Trees'        , et)
+              , ('Random Forest'      , rf)
+              , ('Naive Bayes'        , nb)
+              , ('Random Forest2'     , rf2)
+              , ('XGBoost'            , xgb)]

-    skf_cv_scores = {}
+    mm_skf_scoresD = {}

     for model_name, model_fn in models:
         print('\nModel_name:', model_name
               , '\nModel func:' , model_fn
               , '\nList of models:', models)

-        # model_pipeline = Pipeline([
-        #     ('pre'   , MinMaxScaler())
-        #     , ('model' , model_fn)])

         model_pipeline = Pipeline([
             ('prep'    , col_transform)
             , ('model' , model_fn)])

         print('Running model pipeline:', model_pipeline)
-        skf_cv = cross_validate(model_pipeline
-                                , X_train
-                                , y_train
-                                , cv = 10
-                                , scoring = scoring_fn
-                                , return_train_score = True)
-        skf_cv_scores[model_name] = {}
-        for key, value in skf_cv.items():
+        skf_cv_mod = cross_validate(model_pipeline
+                                    , input_df
+                                    , target
+                                    , cv = skf_cv
+                                    , scoring = scoring_fn
+                                    , return_train_score = True)
+        mm_skf_scoresD[model_name] = {}
+        for key, value in skf_cv_mod.items():
             print('\nkey:', key, '\nvalue:', value)
             print('\nmean value:', mean(value))
-            skf_cv_scores[model_name][key] = round(mean(value),2)
-    #pp.pprint(skf_cv_scores)
-    return(skf_cv_scores)
+            mm_skf_scoresD[model_name][key] = round(mean(value),2)
+    #pp.pprint(mm_skf_scoresD)
+    return(mm_skf_scoresD)

View file

@@ -5,29 +5,19 @@ Created on Tue Mar 15 11:09:50 2022
 @author: tanu
 """
-# stratified shuffle split
-X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
-                                                    , num_df_wtgt['mutation_class']
-                                                    , test_size = 0.33
-                                                    , **rs
-                                                    , shuffle = True
-                                                    , stratify = num_df_wtgt['mutation_class'])
-
-y_train.to_frame().value_counts().plot(kind = 'bar')
-y_test.to_frame().value_counts().plot(kind = 'bar')
-
-MultClassPipelineCV(X_train, X_test, y_train, y_test
-                    , input_df = num_df_wtgt[numerical_FN]
-                    , var_type = 'numerical')
-
-skf_cv_scores = MultClassPipelineCV(X_train, X_test, y_train, y_test
-                                    , input_df = num_df_wtgt[numerical_FN]
-                                    , var_type = 'numerical')
-
-pp.pprint(skf_cv_scores)
-# construct a df
-skf_cv_scores_df = pd.DataFrame(skf_cv_scores)
-skf_cv_scores_df
-skf_cv_scores_df_test = skf_cv_scores_df.filter(like='test_', axis=0)
-skf_cv_scores_df_train = skf_cv_scores_df.filter(like='train_', axis=0)
+#%% Data
+X = all_df_wtgt[numerical_FN+categorical_FN]
+y = all_df_wtgt['mutation_class']
+
+#%% variables
+
+#%% MultClassPipeSKFCV: function call()
+mm_skf_scoresD = MultClassPipeSKFCV(input_df = X
+                                    , target = y
+                                    , var_type = 'mixed'
+                                    , skf_cv = skf_cv)
+
+mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
+mm_skf_scores_df_all
+mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0)
+mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results
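The test_/train_ row filters rely on cross_validate's naming scheme: with a dict of scorers and return_train_score = True, the returned keys are 'fit_time', 'score_time', and one 'test_<scorer>' / 'train_<scorer>' pair per scorer, and those keys become the row index of the DataFrame built above. Roughly:

    # rows of mm_skf_scores_df_all (one column per model):
    #   fit_time, score_time,
    #   test_fscore, train_fscore,
    #   test_mcc,    train_mcc,
    #   ... one pair per entry in scoring_fn
    mm_skf_scores_df_all.filter(like='test_', axis=0)   # held-out fold means only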

View file

@@ -138,6 +138,14 @@ parameters = [
         #'tfidf__stop_words': [None],
         'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
     },
+    {
+        'clf__estimator': [LogisticRegression()],
+        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
+        'penalty': ['none', 'l1', 'l2', 'elasticnet'],
+        'max_iter': list(range(100,800,100)),
+        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
+    },
 ]

 pipeline = Pipeline([
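A caveat on the grid added in this hunk: if the parameters list is passed to GridSearchCV over the pipeline (as the existing 'clf__estimator__alpha' key suggests), the bare 'C', 'penalty', 'max_iter' and 'solver' keys would raise an invalid-parameter error, because grid keys must be addressed through the pipeline step. A corrected sketch, assuming the step is named 'clf' and wraps the estimator:

    {
        'clf__estimator': [LogisticRegression()],
        'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
        'clf__estimator__max_iter': list(range(100, 800, 100)),
        'clf__estimator__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    },

Note also that not every solver/penalty pair is valid (e.g. 'liblinear' supports neither 'none' nor 'elasticnet'), so GridSearchCV would need error_score = np.nan to skip failing combinations rather than aborting.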

View file

@@ -17,8 +17,12 @@ from sklearn.neighbors import KNeighborsClassifier
 from sklearn.svm import SVC
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.neural_network import MLPClassifier
 from xgboost import XGBClassifier
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.linear_model import SGDClassifier
 from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
 from sklearn.compose import ColumnTransformer

@@ -52,11 +56,29 @@ from imblearn.over_sampling import RandomOverSampler
 from imblearn.over_sampling import SMOTE
 from imblearn.pipeline import Pipeline
 #from sklearn.datasets import make_classification
-from sklearn.model_selection import cross_validate
+from sklearn.model_selection import cross_validate, cross_val_score
 from sklearn.model_selection import RepeatedStratifiedKFold
 from sklearn.ensemble import AdaBoostClassifier
 from imblearn.combine import SMOTEENN
 from imblearn.under_sampling import EditedNearestNeighbours
+from sklearn.model_selection import GridSearchCV
+from sklearn.base import BaseEstimator
+
+scoring_fn = ({'accuracy'   : make_scorer(accuracy_score)
+              , 'fscore'    : make_scorer(f1_score)
+              , 'mcc'       : make_scorer(matthews_corrcoef)
+              , 'precision' : make_scorer(precision_score)
+              , 'recall'    : make_scorer(recall_score)
+              , 'roc_auc'   : make_scorer(roc_auc_score)
+              })
+
+rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
+skf_cv = StratifiedKFold(n_splits = 10
+                         #, shuffle = False, random_state = None)
+                         , shuffle = True, **rs)
 #%%
 homedir = os.path.expanduser("~")
 os.chdir(homedir + "/git/ML_AI_training/")

@@ -64,8 +86,8 @@ os.chdir(homedir + "/git/ML_AI_training/")
 # my function
 from MultClassPipe import MultClassPipeline
 from MultClassPipe2 import MultClassPipeline2
-from loopity_loop import MultClassPipeSKF
-from MultClassPipe3 import MultClassPipelineCV
+from loopity_loop import MultClassPipeSKFLoop
+from MultClassPipe3 import MultClassPipeSKFCV

 gene = 'pncA'

@@ -199,3 +221,16 @@ cat_df_wtgt.shape
 all_df_wtgt = my_df[numerical_FN + categorical_FN + ['mutation_class']]
 all_df_wtgt.shape
-#%%
+#%% Get train-test split and scoring functions
+X = num_df_wtgt[numerical_FN]
+y = num_df_wtgt['mutation_class']
+
+X_train, X_test, y_train, y_test = train_test_split(X
+                                                    , y
+                                                    , test_size = 0.33
+                                                    , random_state = 2
+                                                    , shuffle = True
+                                                    , stratify = y)
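For context on why the explicit splitter is worth injecting: for classifiers, an integer cv already means stratified folds, but without shuffling and with no seed. A sketch of the difference:

    from sklearn.model_selection import StratifiedKFold

    # what cv = 10 meant before, for a classifier (no shuffle, no seed):
    default_cv = StratifiedKFold(n_splits = 10)
    # what the refactor passes instead (shuffled, reproducible):
    skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)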

View file

@@ -33,23 +33,30 @@ from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
 from statistics import mean, stdev, median, mode
 #%%
 rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
 # Done: add preprocessing step with one hot encoder
-# TODO: supply stratified K-fold cv train and test data
+# TODO: supply stratified K-fold cv train and test dataskf
 # TODO: get accuracy and other scores through K-fold cv

 # Multiple Classification - Model Pipeline
-def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical', 'mixed'], skf_splits = 10):
+def MultClassPipeSKFLoop(input_df, target, skf_cv, var_type = ['numerical', 'categorical', 'mixed']):
     '''
     @param input_df: input features
-    @type: df (gets converted to np.array for stratified K-fold, and helps identify names to apply column transformation)
-    @param y_outputF: target (or output) feature
-    @type: df or np.array
+    @type: df with input features WITHOUT the target variable
+
+    @param target: target (or output) feature
+    @type: df or np.array or Series
+
+    @param skf_cv: stratified K-fold int or object, allowing shuffle and random state to be passed
+    @type: int or StratifiedKFold()
+
+    @param var_type: numerical, categorical or mixed, to determine which col_transform to apply (MinMaxScaler and/or one-hot encoder)
+    @type: list

     returns
-    multiple classification model scores
+    Dict containing multiple classification scores for each model and each stratified K-fold
     '''
     # Determine categorical and numerical features

@@ -86,17 +93,17 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
                          , n_estimators = 150
                          , bootstrap = True
                          , oob_score = True
-                         , n_jobs = -1
+                         , **njobs
                          , **rs
                          , max_features = 'auto')
-    xgb = XGBClassifier(**rs, verbosity = 0)
+    xgb = XGBClassifier(**rs, verbosity = 0, use_label_encoder = False)

     classification_metrics = {
         'F1_score': []
         ,'MCC': []
         ,'Precision': []
         ,'Recall': []
-        ,'Accuracy': []
+        , 'Accuracy': []
         ,'ROC_AUC': []
         }

@@ -109,33 +116,29 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
               , ('Extra Trees'    , et)
               , ('Random Forest'  , rf)
               , ('Naive Bayes'    , nb)
               , ('Random Forest2' , rf2)
-              #, ('XGBoost'       , xgb)
+              , ('XGBoost'        , xgb)
               ]

-    skf = StratifiedKFold(n_splits = skf_splits
-                          , shuffle = True
-                          , **rs)
-    # skf_dict = {}
+    # skf = StratifiedKFold(n_splits = 10
+    #                       #, shuffle = False, random_state = None)
+    #                       , shuffle = True, **rs)

     fold_no = 1
     fold_dict={}
     for model_name, model in models:
         fold_dict.update({ model_name: {}})

     #scores_df = pd.DataFrame()
-    for train_index, test_index in skf.split(input_df, y_targetF):
+    for train_index, test_index in skf_cv.split(input_df, target):
         x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
-        y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
+        y_train_fold, y_test_fold = target.iloc[train_index], target.iloc[test_index]
         #print("Fold: ", fold_no, len(train_index), len(test_index))

         for model_name, model in models:
             print("\nStart of model", model_name, "\nLoop no.", fold_no)
-            #skf_dict.update({model_name: classification_metrics })
             model_pipeline = Pipeline(steps=[('prep'        , col_transform)
                                              , ('classifier' , model)])
             model_pipeline.fit(x_train_fold, y_train_fold)
             y_pred_fold = model_pipeline.predict(x_test_fold)

@@ -168,14 +171,4 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
                 fold_dict[model_name][fold].update({'ROC_AUC' : roc_auc})
         fold_no +=1
-    #pp.pprint(skf_dict)
     return(fold_dict)
-
-#%% Call function
-# t3_res = MultClassPipeSKF(input_df = numerical_features_df
-#                           , y_targetF = target1
-#                           , var_type = 'numerical'
-#                           , skf_splits = 10)
-# pp.pprint(t3_res)
-# #print(t3_res)

View file

@@ -5,22 +5,19 @@ Created on Fri Mar 11 11:15:50 2022
 @author: tanu
 """
-#%%
-del(t3_res)
-# t3_res = MultClassPipeSKF(input_df = numerical_features_df
-#                           , y_targetF = target1
-#                           , var_type = 'numerical'
-#                           , skf_splits = 10)
-# pp.pprint(t3_res)
-# #print(t3_res)
+#%% variables
+rs = {'random_state': 42}
+
+skf_cv = StratifiedKFold(n_splits = 10
+                         #, shuffle = False, random_state = None)
+                         , shuffle = True, **rs)

-t3_res = MultClassPipeSKF(input_df = num_df_wtgt[numerical_FN]
-                          , y_targetF = num_df_wtgt['mutation_class']
+#%% MultClassPipeSKFLoop: function call()
+t3_res = MultClassPipeSKFLoop(input_df = num_df_wtgt[numerical_FN]
+                              , target = num_df_wtgt['mutation_class']
                               , var_type = 'numerical'
-                          , skf_splits = 10)
+                              , skf_cv = skf_cv)
 pp.pprint(t3_res)
 #print(t3_res)

 ################################################################
 # extract items from within a nested dict
 #%% Classification Metrics we need to mean()
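The nested dict returned by MultClassPipeSKFLoop has the shape {model_name: {fold: {metric: value}}}, so the per-model means have to be pulled out of the inner level. A minimal sketch of that extraction (hypothetical helper names, assuming the t3_res structure above):

    from statistics import mean
    import pandas as pd

    mean_scoresD = {}
    for model_name, folds in t3_res.items():
        per_metric = {}
        # collect each metric's value across this model's folds
        for fold, metrics in folds.items():
            for metric, value in metrics.items():
                per_metric.setdefault(metric, []).append(value)
        # average across folds, rounded as in MultClassPipeSKFCV
        mean_scoresD[model_name] = {m: round(mean(v), 2) for m, v in per_metric.items()}

    mean_scores_df = pd.DataFrame(mean_scoresD)  # metrics as rows, models as columns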