added Mult_clfs_logo and Mult_clsf.py with consistency

2022-07-10 12:32:52 +01:00 · 2022-07-10 12:32:52 +01:00 · de5c1270be
commit de5c1270be
parent 06f2ce97b6
5 changed files with 201 additions and 115 deletions
--- a/scripts/ml/ml_functions/MultClfs.py
+++ b/scripts/ml/ml_functions/MultClfs.py
@ -74,10 +74,13 @@ from sklearn.impute import KNNImputer as KNN
 import json
 import argparse
 import re
+import itertools
+from sklearn.model_selection import LeaveOneGroupOut
 from sklearn.decomposition import PCA
+
 #%% GLOBALS
-rs = {'random_state': 42}
-njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
+#rs = {'random_state': 42}
+#njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores

 scoring_fn =  ({ 'mcc'        : make_scorer(matthews_corrcoef)
                , 'fscore'    : make_scorer(f1_score)
@ -88,13 +91,13 @@ scoring_fn =  ({ 'mcc'        : make_scorer(matthews_corrcoef)
                , 'jcc'       : make_scorer(jaccard_score)
            }) 
  
-skf_cv = StratifiedKFold(n_splits = 10
-                          #, shuffle = False, random_state= None)
-                           , shuffle = True,**rs)
+#skf_cv = StratifiedKFold(n_splits = 10
+#                          #, shuffle = False, random_state= None)
+#                           , shuffle = True,**rs)

-rskf_cv = RepeatedStratifiedKFold(n_splits = 10
-                                  , n_repeats = 3
-                                  , **rs)
+#rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+#                                  , n_repeats = 3
+#                                  , **rs)

 mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
 jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
@ -137,6 +140,7 @@ scoreBT_mapD = {'bts_mcc'          : 'MCC'
                , 'bts_jcc'        : 'JCC'
               }

+#gene_group = 'gene_name'
 #%%############################################################################
 ############################
 # MultModelsCl()
@ -145,17 +149,23 @@ scoreBT_mapD = {'bts_mcc'          : 'MCC'
 # Multiple Classification - Model Pipeline
 def MultModelsCl(input_df, target
                       , sel_cv
-                       , blind_test_df
-                       , blind_test_target
-                       , tts_split_type 
-
-                       , resampling_type = 'none' # default
+                       , tts_split_type
+                       , resampling_type
+                       #, group = None
+                       
                       , add_cm = True # adds confusion matrix based on cross_val_predict
                       , add_yn = True  # adds target var class numbers
-                       , var_type = ['numerical', 'categorical','mixed'] 
+                       , var_type = ['numerical', 'categorical','mixed']
                       , scale_numeric = ['min_max', 'std', 'min_max_neg', 'none'] 
+
                       , run_blind_test = True
-                       , return_formatted_output = True):
+                       , blind_test_df = pd.DataFrame()
+                       , blind_test_target = pd.Series(dtype = int)
+                       , return_formatted_output = True
+
+                       , random_state = 42
+                       , n_jobs = os.cpu_count() # the number of jobs should equal the number of CPU cores
+                       ):

    '''
    @ param input_df: input features 
@ -173,7 +183,25 @@ def MultModelsCl(input_df, target
    returns
    Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training
    '''
+    
+#%% Func globals        
+    rs = {'random_state': random_state}
+    njobs = {'n_jobs': n_jobs}
+    
+    skf_cv = StratifiedKFold(n_splits = 10
+                              #, shuffle = False, random_state= None)
+                              , shuffle = True,**rs)

+    rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+                                      , n_repeats = 3
+                                      , **rs)
+    logo = LeaveOneGroupOut()
+
+    # select CV type:           
+    # if group == None:
+    #     sel_cv = skf_cv
+    # else: 
+    #     sel_cv = logo
    #======================================================
    # Determine categorical and numerical features
    #======================================================
@ -196,8 +224,9 @@ def MultModelsCl(input_df, target
    # #     t = [('num', MinMaxScaler(), numerical_ix)
    # #         , ('cat', OneHotEncoder(), categorical_ix) ]

-    # if var_type == 'mixed':
-    #     t = [('cat', OneHotEncoder(), categorical_ix) ]
+    # col_transform = ColumnTransformer(transformers = t
+    #                                    , remainder='passthrough')
+    
    if type(var_type) == list: 
        var_type = str(var_type[0])
    else:
@ -229,37 +258,37 @@ def MultModelsCl(input_df, target
    #======================================================
    # Specify multiple Classification Models  
    #======================================================
-    models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
-               # , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
-               # , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
-               #  , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
-               #  , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
-               #  , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
-               #  , ('Gaussian NB'               , GaussianNB() )
-               #  , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
-               #  , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
-                 , ('LDA'                       , LinearDiscriminantAnalysis() )
-               # , ('Logistic Regression'       , LogisticRegression(**rs) )
-               # , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
-               # , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
-                #, ('Multinomial'               , MultinomialNB() )
-                # , ('Naive Bayes'               , BernoulliNB() )
-                # , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
-                # , ('QDA'                       , QuadraticDiscriminantAnalysis() )
-                # , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
-                # # , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
-                #                                                         , n_estimators     = 1000
-                #                                                         , bootstrap        = True
-                #                                                         , oob_score        = True
-                #                                                         , **njobs
-                #                                                         , **rs
-                #                                                         , max_features     = 'auto') ) 
-             #     , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
-             #     , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
-             #     , ('SVC'                       , SVC(**rs) ) 
-             #     , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-             #     , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
-             # 
+    models = [('AdaBoost Classifier'         , AdaBoostClassifier(**rs) )
+              , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
+              , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
+              , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
+              , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
+              , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
+              , ('Gaussian NB'               , GaussianNB() )
+              , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
+              , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
+              , ('LDA'                       , LinearDiscriminantAnalysis() )
+              , ('Logistic Regression'       , LogisticRegression(**rs) )
+              , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
+              , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
+              , ('Multinomial'               , MultinomialNB() )
+              , ('Naive Bayes'               , BernoulliNB() )
+              , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
+              , ('QDA'                       , QuadraticDiscriminantAnalysis() )
+              , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
+              , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
+                                                                         , n_estimators     = 1000
+                                                                         , bootstrap        = True
+                                                                         , oob_score        = True
+                                                                         , **njobs
+                                                                         , **rs
+                                                                         , max_features     = 'auto') ) 
+              , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
+              , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
+              , ('SVC'                       , SVC(**rs) ) 
+              , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
+              , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
+              
             ]
                
    mm_skf_scoresD = {}
@ -289,10 +318,11 @@ def MultModelsCl(input_df, target
            
            
        print('\nRunning model pipeline:', model_pipeline)
-        skf_cv_modD = cross_validate(model_pipeline
+        cv_modD = cross_validate(model_pipeline
                              , input_df
                              , target
                              , cv = sel_cv
+                              #, groups = group
                              , scoring = scoring_fn
                              , return_train_score = True)
        #==============================
@ -300,7 +330,7 @@ def MultModelsCl(input_df, target
        #==============================
        mm_skf_scoresD[model_name] = {}
        
-        for key, value in skf_cv_modD.items():
+        for key, value in cv_modD.items():
            print('\nkey:', key, '\nvalue:', value)
            print('\nmean value:', np.mean(value))
            mm_skf_scoresD[model_name][key] = round(np.mean(value),2)
@ -308,7 +338,7 @@ def MultModelsCl(input_df, target
        # ADD more info: meta data related to input df
        mm_skf_scoresD[model_name]['resampling']        = resampling_type
        mm_skf_scoresD[model_name]['n_training_size']   = len(input_df)
-        #mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(Counter(target)[0]/Counter(target)[1], 2)
+        mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(Counter(target)[0]/Counter(target)[1], 2)
        mm_skf_scoresD[model_name]['n_features']        = len(input_df.columns)
        mm_skf_scoresD[model_name]['tts_split']         = tts_split_type
        
@ -321,7 +351,12 @@ def MultModelsCl(input_df, target
           cmD = {}

            # Calculate cm         
-           y_pred   = cross_val_predict(model_pipeline, input_df, target, cv = sel_cv, **njobs)
+           y_pred   = cross_val_predict(model_pipeline
+                                        , input_df
+                                        , target
+                                        , cv = sel_cv
+                                        #, groups = group
+                                        , **njobs)
            #_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally
           tn, fp, fn, tp = confusion_matrix(y_pred, target).ravel()
    
@ -357,7 +392,7 @@ def MultModelsCl(input_df, target
           # Build bts numbers dict
           btD = {'n_blindY_neg'    : Counter(blind_test_target)[0]
                  , 'n_blindY_pos'  : Counter(blind_test_target)[1]
-                  #, 'n_testY_ratio' : round(Counter(blind_test_target)[0]/Counter(blind_test_target)[1], 2)
+                  , 'n_testY_ratio' : round(Counter(blind_test_target)[0]/Counter(blind_test_target)[1], 2)
                  , 'n_test_size'   : len(blind_test_df) }
           
           # Update cmD+tnD dicts with btD
@ -371,9 +406,9 @@ def MultModelsCl(input_df, target
           bts_predict = model_pipeline.predict(blind_test_df)
           
           bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
-           print('\nMCC on Blind test:'     , bts_mcc_score)
+           print('\nMCC on Blind test:'      , bts_mcc_score)
           #print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
-           print('\nMCC on Training:'      , mm_skf_scoresD[model_name]['test_mcc'] )
+           print('\nMCC on Training:'        , mm_skf_scoresD[model_name]['test_mcc'] )

           mm_skf_scoresD[model_name]['bts_mcc']       = bts_mcc_score
           mm_skf_scoresD[model_name]['bts_fscore']    = round(f1_score(blind_test_target, bts_predict),2)
@ -384,7 +419,7 @@ def MultModelsCl(input_df, target
           mm_skf_scoresD[model_name]['bts_jcc']       = round(jaccard_score(blind_test_target, bts_predict),2)
           #mm_skf_scoresD[model_name]['diff_mcc']      = train_test_diff_MCC

-           
+
    #return(mm_skf_scoresD)
    #============================
    # Process the dict to have WF
@ -526,7 +561,8 @@ def ProcessMultModelsCl(inputD = {}, blind_test_data = True):
                sys.exit('\nFAIL: Could not merge metadata with CV and BT dfs')

        else:
-            print('\nConcatenting dfs not possible [WF],check numbers ')    
+        #    print('\nConcatenting dfs not possible [WF],check numbers ')    
+            print('\nOnly combining CV and metadata')    

        #-------------------------------------
        # Combine WF+Metadata: Final output