modified loopity_loop and MultClassPipe3 to take skf_cv as a parameter for cv

2022-03-17 18:17:58 +00:00 · 2022-03-17 18:17:58 +00:00 · d0c329a1d9
commit d0c329a1d9
parent 97620c1bb0
8 changed files with 161 additions and 127 deletions
--- a/MultClassPipe3.py
+++ b/MultClassPipe3.py
@ -61,23 +61,39 @@ from imblearn.combine import SMOTEENN
 from imblearn.under_sampling import EditedNearestNeighbours

 #%%
-rs = {'random_state': 42}
-# Done: add preprocessing step with one hot encoder
-# Done: get accuracy and other scores through K-fold stratified cv
+# rs = {'random_state': 42}
+# njobs = {'n_jobs': 10}

-scoring_fn =  ({ 'fscore'     : make_scorer(f1_score)
-                 , 'mcc'        : make_scorer(matthews_corrcoef)
-                 , 'precision' : make_scorer(precision_score)
-                 , 'recall'    : make_scorer(recall_score)
-                 , 'accuracy'      : make_scorer(accuracy_score)
-                 ,  'roc_auc'   : make_scorer(roc_auc_score)
-                 #,  'jaccard'   : make_scorer(jaccard_score)
+scoring_fn =  ({ 'fscore'       : make_scorer(f1_score)
+                  , 'mcc'        : make_scorer(matthews_corrcoef)
+                  , 'precision'  : make_scorer(precision_score)
+                  , 'recall'     : make_scorer(recall_score)
+                  , 'accuracy'   : make_scorer(accuracy_score)
+                  ,  'roc_auc'   : make_scorer(roc_auc_score)
+                  #,  'jaccard'   : make_scorer(jaccard_score)
            })    


 # Multiple Classification - Model Pipeline
-def MultClassPipelineCV(X_train, X_test, y_train, y_test, input_df, var_type = ['numerical', 'categorical','mixed']):
+def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categorical','mixed']):

+    '''
+    @ param input_df: input features 
+    @ type: df with input features WITHOUT the target variable
+    
+    @param target: target (or output) feature
+    @type: df or np.array or Series
+    
+    @param skf_cv: number of stratified K-fold splits (int), or a StratifiedKFold object so that shuffle and random state can be passed
+    @type: int or StratifiedKfold()
+    
+    @param var_type: numerical, categorical or mixed; determines which col_transform to apply (MinMaxScaler and/or one-hot encoder)
+    @type: list
+
+    returns
+    Dict keyed by model name; each value maps every cross_validate output (timings, test_* and train_* scores) to its mean over the stratified K-folds, rounded to 2 decimal places
+       
+    '''
    # determine categorical and numerical features
    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
    numerical_ix
@ -98,66 +114,61 @@ def MultClassPipelineCV(X_train, X_test, y_train, y_test, input_df, var_type = [
    col_transform = ColumnTransformer(transformers = t
                                       , remainder='passthrough')
    
-    #%%
+    #%% Specify multiple Classification models
    log_reg = LogisticRegression(**rs)
-    nb = BernoulliNB()
-    knn = KNeighborsClassifier()
-    svm = SVC(**rs)
-    mlp = MLPClassifier(max_iter=500, **rs)
-    dt = DecisionTreeClassifier(**rs)
-    et = ExtraTreesClassifier(**rs)
-    rf = RandomForestClassifier(**rs)
-    rf2 = RandomForestClassifier(
-                          min_samples_leaf=50,
-                          n_estimators=150,
-                          bootstrap=True,
-                          oob_score=True,
-                          n_jobs=-1,
-                          random_state=42,
-                          max_features='auto')
-    
-    xgb = XGBClassifier(**rs, verbosity=0)
+    nb      = BernoulliNB()
+    knn     = KNeighborsClassifier()
+    svm     = SVC(**rs)
+    mlp     = MLPClassifier(max_iter = 500, **rs)
+    dt      = DecisionTreeClassifier(**rs)
+    et      = ExtraTreesClassifier(**rs)
+    rf      = RandomForestClassifier(**rs)
+    rf2     = RandomForestClassifier(
+                          min_samples_leaf = 50
+                          , n_estimators     = 150
+                          , bootstrap        = True
+                          , oob_score        = True
+                          , **njobs
+                          , **rs
+                          , max_features     = 'auto')
+    xgb = XGBClassifier(**rs
+                        , verbosity = 0, use_label_encoder =False)

-    models = [
-            ('Logistic Regression', log_reg), 
-            ('Naive Bayes', nb),
-            ('K-Nearest Neighbors', knn), 
-            ('SVM', svm), 
-            ('MLP', mlp), 
-            ('Decision Tree', dt), 
-            ('Extra Trees', et), 
-            ('Random Forest', rf), 
-            ('Random Forest2', rf2), 
-            #('XGBoost', xgb)
-            ]
-            
-    skf_cv_scores = {}
+    models = [('Logistic Regression', log_reg)
+            , ('Naive Bayes'        , nb)
+            , ('K-Nearest Neighbors', knn) 
+            , ('SVM'                , svm) 
+            , ('MLP'                , mlp) 
+            , ('Decision Tree'      , dt) 
+            , ('Extra Trees'        , et) 
+            , ('Random Forest'      , rf) 
+            , ('Naive Bayes'        , nb)
+            , ('Random Forest2'     , rf2) 
+            , ('XGBoost'            , xgb)]
+        
+    mm_skf_scoresD = {}
     
    for model_name, model_fn in models:
        print('\nModel_name:', model_name
        , '\nModel func:'    , model_fn
        , '\nList of models:', models)
    
-    #    model_pipeline = Pipeline([
-    #        ('pre'     , MinMaxScaler())
-    #        , ('model'  , model_fn)])
-            
        model_pipeline = Pipeline([
            ('prep'     , col_transform)
-            , ('model' , model_fn)])
+            , ('model'  , model_fn)])
            
        print('Running model pipeline:', model_pipeline)
-        skf_cv = cross_validate(model_pipeline
-                              , X_train
-                              , y_train
-                              , cv = 10
+        skf_cv_mod = cross_validate(model_pipeline
+                              , input_df
+                              , target
+                              , cv = skf_cv
                              , scoring = scoring_fn
                              , return_train_score = True)
-        skf_cv_scores[model_name] = {}
-        for key, value in skf_cv.items():
+        mm_skf_scoresD[model_name] = {}
+        for key, value in skf_cv_mod.items():
            print('\nkey:', key, '\nvalue:', value)
            print('\nmean value:', mean(value))
-            skf_cv_scores[model_name][key] = round(mean(value),2)
-            #pp.pprint(skf_cv_scores)
-    return(skf_cv_scores)
+            mm_skf_scoresD[model_name][key] = round(mean(value),2)
+            #pp.pprint(mm_skf_scoresD)
+    return(mm_skf_scoresD)

--- a/MultClassPipe3_CALL.py
+++ b/MultClassPipe3_CALL.py
@ -5,29 +5,19 @@ Created on Tue Mar 15 11:09:50 2022

@author: tanu
 """
-# stratified shuffle split
-X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
-                                                    , num_df_wtgt['mutation_class']
-                                                    , test_size = 0.33
-                                                    , **rs
-                                                    , shuffle = True
-                                                    , stratify = num_df_wtgt['mutation_class'])
+#%% Data
+X = all_df_wtgt[numerical_FN+categorical_FN]
+y = all_df_wtgt['mutation_class']
+#%% variables

-y_train.to_frame().value_counts().plot(kind = 'bar')
-y_test.to_frame().value_counts().plot(kind = 'bar')
-
-MultClassPipelineCV(X_train, X_test, y_train, y_test
-         , input_df = num_df_wtgt[numerical_FN]
-         , var_type = 'numerical')
+#%% MultClassPipeSKFCV: function call()
+mm_skf_scoresD = MultClassPipeSKFCV(input_df = X
+                                        , target = y
+                                        , var_type = 'mixed'
+                                        , skf_cv = skf_cv)


-skf_cv_scores = MultClassPipelineCV(X_train, X_test, y_train, y_test
-         , input_df = num_df_wtgt[numerical_FN]
-         , var_type = 'numerical')
-
-pp.pprint(skf_cv_scores)
-# construct a df
-skf_cv_scores_df = pd.DataFrame(skf_cv_scores)
-skf_cv_scores_df
-skf_cv_scores_df_test = skf_cv_scores_df.filter(like='test_', axis=0)
-skf_cv_scores_df_train = skf_cv_scores_df.filter(like='train_', axis=0)
+mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
+mm_skf_scores_df_all
+mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0)
+mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results
--- a/pycache/MultClassPipe3.cpython-37.pyc
+++ b/pycache/MultClassPipe3.cpython-37.pyc
--- a/pycache/loopity_loop.cpython-37.pyc
+++ b/pycache/loopity_loop.cpython-37.pyc
--- a/base_estimator.py
+++ b/base_estimator.py
@ -138,6 +138,14 @@ parameters = [
        #'tfidf__stop_words': [None],
        'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
    },
+    
+    {
+        'clf__estimator': [LogisticRegression()],
+        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
+        'penalty': ['none', 'l1', 'l2', 'elasticnet'],
+        'max_iter': list(range(100,800,100)),
+        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
+    },
 ]

 pipeline = Pipeline([
--- a/imports.py
+++ b/imports.py
@ -17,8 +17,12 @@ from sklearn.neighbors import KNeighborsClassifier
 from sklearn.svm import SVC
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.neural_network import MLPClassifier
 from xgboost import XGBClassifier
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.linear_model import SGDClassifier
 from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

 from sklearn.compose import ColumnTransformer
@ -52,11 +56,29 @@ from imblearn.over_sampling import RandomOverSampler
 from imblearn.over_sampling import SMOTE
 from imblearn.pipeline import Pipeline
 #from sklearn.datasets import make_classification
-from sklearn.model_selection import cross_validate
+from sklearn.model_selection import cross_validate, cross_val_score
 from sklearn.model_selection import RepeatedStratifiedKFold
 from sklearn.ensemble import AdaBoostClassifier
 from imblearn.combine import SMOTEENN
 from imblearn.under_sampling import EditedNearestNeighbours
+
+from sklearn.model_selection import GridSearchCV
+from sklearn.base import BaseEstimator
+
+scoring_fn =  ({'accuracy'      : make_scorer(accuracy_score)
+                 , 'fscore'     : make_scorer(f1_score)
+                 , 'mcc'        : make_scorer(matthews_corrcoef)
+                 ,  'precision' : make_scorer(precision_score)
+                 ,  'recall'    : make_scorer(recall_score)
+                 ,  'roc_auc'   : make_scorer(roc_auc_score)
+            }) 
+  
+rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
+skf_cv = StratifiedKFold(n_splits = 10
+                          #, shuffle = False, random_state= None)
+                           , shuffle = True,**rs)
+
 #%%
 homedir = os.path.expanduser("~")
 os.chdir(homedir + "/git/ML_AI_training/")
@ -64,8 +86,8 @@ os.chdir(homedir + "/git/ML_AI_training/")
 # my function
 from MultClassPipe import MultClassPipeline
 from MultClassPipe2 import MultClassPipeline2
-from loopity_loop import MultClassPipeSKF
-from MultClassPipe3 import MultClassPipelineCV
+from loopity_loop import MultClassPipeSKFLoop
+from MultClassPipe3 import MultClassPipeSKFCV


 gene = 'pncA'
@ -199,3 +221,16 @@ cat_df_wtgt.shape

 all_df_wtgt = my_df[numerical_FN + categorical_FN + ['mutation_class']]
 all_df_wtgt.shape
+
+#%%
+#%% Get train-test split and scoring functions
+X = num_df_wtgt[numerical_FN]
+y = num_df_wtgt['mutation_class']
+
+X_train, X_test, y_train, y_test = train_test_split(X
+                                            ,y
+                                            , test_size    = 0.33
+                                            , random_state = 2
+                                            , shuffle      = True
+                                            , stratify     = y)
+ 
--- a/loopity_loop.py
+++ b/loopity_loop.py
@ -33,23 +33,30 @@ from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoe
 from statistics import mean, stdev, median, mode
 #%%
 rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
+   
 # Done: add preprocessing step with one hot encoder
-# TODO: supply stratified K-fold cv train and test data
+# TODO: supply stratified K-fold cv train and test data
 # TODO: get accuracy and other scores through K-fold cv

 # Multiple Classification - Model Pipeline
-def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical','mixed'], skf_splits = 10):
+def MultClassPipeSKFLoop(input_df, target, skf_cv, var_type = ['numerical','categorical','mixed']):

    '''
    @ param input_df: input features 
-    @ type: df (gets converted to np.array for stratified Kfold, and helps identify names to apply column transformation)
+    @ type: df with input features WITHOUT the target variable
    
-    @param y_outputF: target (or output) feature
-    @type: df or np.array
+    @param target: target (or output) feature
+    @type: df or np.array or Series
    
+    @param skf_cv: stratified K-fold splitter, passed in so that shuffle and random state can be set by the caller
+    @type: StratifiedKFold() (its .split() method is called directly, so a plain int will not work here)
+    
+    @param var_type: numerical, categorical or mixed; determines which col_transform to apply (MinMaxScaler and/or one-hot encoder)
+    @type: list

    returns
-    multiple classification model scores
+    Dict containing multiple classification scores for each model and each Stratified Kfold
       
    '''
    # Determine categorical and numerical features
@ -86,17 +93,17 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
                          , n_estimators     = 150
                          , bootstrap        = True
                          , oob_score        = True
-                          , n_jobs           = -1
+                          , **njobs
                          , **rs
                          , max_features     = 'auto')
    
-    xgb = XGBClassifier(**rs, verbosity = 0)
+    xgb = XGBClassifier(**rs, verbosity = 0, use_label_encoder = False)
    classification_metrics = {
        'F1_score': []
        ,'MCC': []
        ,'Precision': []
        ,'Recall': []
-        ,'Accuracy': []
+        , 'Accuracy': []
        ,'ROC_AUC': []
        }
    models = [
@ -109,33 +116,29 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
             , ('Extra Trees'        , et) 
             , ('Random Forest'      , rf) 
             , ('Naive Bayes'        , nb)
-
-            , ('Random Forest2'     , rf2) 
-            #, ('XGBoost'            , xgb)
+             , ('Random Forest2'     , rf2) 
+             , ('XGBoost'            , xgb)
            ]

-    skf = StratifiedKFold(n_splits = skf_splits
-                          , shuffle = True
-                          , **rs)
+    # skf = StratifiedKFold(n_splits = 10
+    #                       #, shuffle = False, random_state= None)
+    #                       , shuffle = True,**rs)

-#    skf_dict = {}
    fold_no = 1
    fold_dict={}

-
    for model_name, model in models:
        fold_dict.update({ model_name: {}})

    #scores_df = pd.DataFrame()
-    for train_index, test_index in skf.split(input_df, y_targetF):
+    for train_index, test_index in skf_cv.split(input_df, target):
        x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
-        y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
+        y_train_fold, y_test_fold = target.iloc[train_index], target.iloc[test_index]
        #print("Fold: ", fold_no, len(train_index), len(test_index))

        for model_name, model in models:
            print("\nStart of model", model_name, "\nLoop no.", fold_no)
-            #skf_dict.update({model_name: classification_metrics })
-            model_pipeline = Pipeline(steps=[('prep'         , col_transform)
+            model_pipeline = Pipeline(steps=[('prep'          , col_transform)
                                              , ('classifier' , model)])
            model_pipeline.fit(x_train_fold, y_train_fold)
            y_pred_fold  = model_pipeline.predict(x_test_fold)
@ -168,14 +171,4 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
            fold_dict[model_name][fold].update({'ROC_AUC'   : roc_auc})
            
        fold_no +=1
-        #pp.pprint(skf_dict)
-
-    return(fold_dict)
-
-#%% CAll function 
-# t3_res = MultClassPipeSKF(input_df = numerical_features_df
-#                           , y_targetF = target1
-#                           , var_type = 'numerical'
-#                           , skf_splits = 10)
-# pp.pprint(t3_res)
-# #print(t3_res)
+    return(fold_dict)
--- a/loopity_loop_CALL.py
+++ b/loopity_loop_CALL.py
@ -5,22 +5,19 @@ Created on Fri Mar 11 11:15:50 2022

@author: tanu
 """
-#%%
-del(t3_res)
-# t3_res = MultClassPipeSKF(input_df = numerical_features_df
-#                           , y_targetF = target1
-#                           , var_type = 'numerical'
-#                           , skf_splits = 10)
-# pp.pprint(t3_res)
-# #print(t3_res)
+#%% variables
+rs = {'random_state': 42}

-t3_res = MultClassPipeSKF(input_df = num_df_wtgt[numerical_FN]
-                          , y_targetF = num_df_wtgt['mutation_class']
+skf_cv = StratifiedKFold(n_splits = 10
+                          #, shuffle = False, random_state= None)
+                          , shuffle = True,**rs)
+#%% MultClassPipeSKFLoop: function call()
+t3_res = MultClassPipeSKFLoop(input_df = num_df_wtgt[numerical_FN]
+                          , target = num_df_wtgt['mutation_class']
                          , var_type = 'numerical'
-                          , skf_splits = 10)
+                          , skf_cv = skf_cv)
 pp.pprint(t3_res)
 #print(t3_res)
-
 ################################################################
 # extract items from within a nested dict
 #%% Classification Metrics we need to mean()