tidying script to run from cmd and via ssh

2022-05-28 09:40:24 +01:00 · 2022-05-28 09:40:24 +01:00 · b6f0308e42
commit b6f0308e42
parent 0a84a4b4dc
4 changed files with 271 additions and 76 deletions
--- a/uq_ml_models_FS/scriptfsycm.py
+++ b/uq_ml_models_FS/scriptfsycm.py
@ -27,17 +27,42 @@ from sklearn.model_selection import train_test_split, cross_validate, cross_val_
 # Metric
 from sklearn.metrics import mean_squared_error, make_scorer, roc_auc_score, f1_score, matthews_corrcoef, accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report

+# Shared settings: random seed, parallel jobs, scoring functions and CV splitters
+rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
+
+scoring_fn =  ({'accuracy'      : make_scorer(accuracy_score)
+                 , 'fscore'     : make_scorer(f1_score)
+                 , 'mcc'        : make_scorer(matthews_corrcoef)
+                 , 'precision'  : make_scorer(precision_score)
+                 , 'recall'     : make_scorer(recall_score)
+                 , 'roc_auc'    : make_scorer(roc_auc_score)
+                 , 'jcc'        : make_scorer(jaccard_score)
+            }) 
+  
+skf_cv = StratifiedKFold(n_splits = 10
+                          #, shuffle = False, random_state= None)
+                           , shuffle = True,**rs)
+
+rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+                                  , n_repeats = 3
+                                  , **rs)
+
+mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
+jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
+#%% YC 
 #def run_all_ML(input_pd, target_label, bts_input, bts_target, var_type):
-def run_all_ML(input_pd, target_label, preprocess = True, var_type = 'numerical'):
+def run_all_ML(input_pd, target_label, blind_test_input_df, blind_test_target, preprocess = True, var_type = 'numerical'):

    #y = input_pd[target_label]
    #X = input_pd.drop(target_label,axis=1)
    y = target_label
    X = input_pd
-    # determine categorical and numerical features
-    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
+    
+    # Determine categorical and numerical features
+    numerical_ix = input_pd.select_dtypes(include=['int64', 'float64']).columns
    numerical_ix
-    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+    categorical_ix = input_pd.select_dtypes(include=['object', 'bool']).columns
    categorical_ix    

    # Determine preprocessing steps ~ var_type
@ -53,17 +78,21 @@ def run_all_ML(input_pd, target_label, preprocess = True, var_type = 'numerical'
        
    col_transform = ColumnTransformer(transformers = t
                                       , remainder='passthrough')
-    result_pd = pd.DataFrame()
+    result_pd     = pd.DataFrame()
+    result_bts_pd = pd.DataFrame()
+    #results_btsD = {}
+    results_all = {}
+    
    for name, algorithm in all_estimators(type_filter="classifier"):
        try:
            estmator = algorithm()
            temp_pd = pd.DataFrame()
            temp_cm = pd.DataFrame()

-            # orig
-            pipe = Pipeline([
-                ("model"    , algorithm())
-            ])
+            # # orig
+            # pipe = Pipeline([
+            #     ("model"    , algorithm())
+            # ])
            
            # turn on and off preprocessing
            if preprocess == True:
@ -76,11 +105,17 @@ def run_all_ML(input_pd, target_label, preprocess = True, var_type = 'numerical'
                    ("model"    , algorithm())
                ])
                
-            
-            y_pred = cross_val_predict(pipe, X, y, cv = 10, n_jobs=10)
-            _mcc = round(matthews_corrcoef(y_pred, y), 3)
-            _bacc = round(balanced_accuracy_score(y_pred, y), 3)
-            _f1 = round(f1_score(y_pred, y), 3)
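+            # Cross-validated predictions (10-fold). NOTE(review): sklearn metrics expect (y_true, y_pred); the calls below pass (y_pred, y) - confirm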
+            y_pred   = cross_val_predict(pipe, X, y, cv = 10, **njobs)
+# TODO: change to cross_validate (see commented-out call below); only then can the results be trusted
+            # y_pred   = cross_validate(pipe, X, y
+            #                           , cv = 10
+            #                           , scoring = scoring_fn
+            #                           , **njobs)
+
+            _mcc     = round(matthews_corrcoef(y_pred, y), 3)
+            _bacc    = round(balanced_accuracy_score(y_pred, y), 3)
+            _f1      = round(f1_score(y_pred, y), 3)
            _roc_auc = round(roc_auc_score(y_pred, y), 3)
            _tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel()
            
@ -88,7 +123,88 @@ def run_all_ML(input_pd, target_label, preprocess = True, var_type = 'numerical'
                                                      columns=['estimator', 'TP', 'TN', 'FP', 'FN',
                                                               'roc_auc', 'matthew', 'bacc', 'f1']),\
                                         ignore_index=True)
+            #=========================
+            # Blind test: BTS results
+            #=========================
+            # Build the final blind-test results: fit the pipeline on the full training data, then score its predictions on the blind test set
+            pipe.fit(input_pd, target_label)
+            bts_predict = pipe.predict(blind_test_input_df)
+
+            bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
+            print('\nMCC on Blind test:'     , bts_mcc_score)
+            #print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
+            
+            _mccBTS     = round(matthews_corrcoef(bts_predict, blind_test_target), 3)
+            _baccBTS    = round(balanced_accuracy_score(bts_predict, blind_test_target), 3)
+            _f1BTS      = round(f1_score(bts_predict, blind_test_target), 3)
+            _roc_aucBTS = round(roc_auc_score(bts_predict, blind_test_target), 3)
+            _tnBTS, _fpBTS, _fnBTS, _tpBTS = confusion_matrix(bts_predict, blind_test_target).ravel()
+            
+            result_bts_pd = result_bts_pd.append(pd.DataFrame(np.column_stack([name
+                                                                            , _tpBTS, _tnBTS
+                                                                            , _fpBTS, _fnBTS
+                                                                            , _roc_aucBTS
+                                                                            , _mccBTS
+                                                                            , _baccBTS, _f1BTS]),\
+                                                      columns=['estimator', 'TP', 'TN', 'FP', 'FN',
+                                                                'roc_auc', 'matthew', 'bacc', 'f1']),\
+                                          ignore_index=True)
+      
+            
+            results_all['CrossValResultsDF']   = result_pd
+            results_all['BlindTestResultsDF']  = result_bts_pd
+
        except Exception as e:
-            print("Got an error while running {}".format(name))
+            print("XXXGot an error while running {}".format(name))
            print(e)
-    return(result_pd)
+            
+            
+    #return(result_pd)    
+    return(results_all)
+    
+
+#%% CALL function
+#run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+
+YC_resD2 = run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+
+YC_resD_ros = run_all_ML(input_pd=X_ros, target_label=y_ros, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+
+CVResultsDF = YC_resD2['CrossValResultsDF']
+CVResultsDF.sort_values(by=['matthew'], ascending=False, inplace=True)
+BTSResultsDF = YC_resD2['BlindTestResultsDF']
+BTSResultsDF.sort_values(by=['matthew'], ascending=False, inplace=True)
+
+# from sklearn.utils import all_estimators
+# for name, algorithm in all_estimators(type_filter="classifier"):
+#     clf = algorithm()
+#     print('Name:', name, '\nAlgo:', clf)
+
+# Random Oversampling
+YC_resD_ros = run_all_ML(input_pd=X_ros, target_label=y_ros, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+CVResultsDF_ros = YC_resD_ros['CrossValResultsDF']
+CVResultsDF_ros.sort_values(by=['matthew'], ascending=False, inplace=True)
+BTSResultsDF_ros = YC_resD_ros['BlindTestResultsDF']
+BTSResultsDF_ros.sort_values(by=['matthew'], ascending=False, inplace=True)
+
+# Random Undersampling
+YC_resD_rus = run_all_ML(input_pd=X_rus, target_label=y_rus, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+CVResultsDF_rus = YC_resD_rus['CrossValResultsDF']
+CVResultsDF_rus.sort_values(by=['matthew'], ascending=False, inplace=True)
+BTSResultsDF_rus = YC_resD_rus['BlindTestResultsDF']
+BTSResultsDF_rus.sort_values(by=['matthew'], ascending=False, inplace=True)
+
+# Random Oversampling+Undersampling
+YC_resD_rouC = run_all_ML(input_pd=X_rouC, target_label=y_rouC, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+CVResultsDF_rouC = YC_resD_rouC['CrossValResultsDF']
+CVResultsDF_rouC.sort_values(by=['matthew'], ascending=False, inplace=True)
+BTSResultsDF_rouC = YC_resD_rouC['BlindTestResultsDF']
+BTSResultsDF_rouC.sort_values(by=['matthew'], ascending=False, inplace=True)
+
+# SMOTE NC
+YC_resD_smnc = run_all_ML(input_pd=X_smnc, target_label=y_smnc, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+CVResultsDF_smnc = YC_resD_smnc['CrossValResultsDF']
+CVResultsDF_smnc.sort_values(by=['matthew'], ascending=False, inplace=True)
+BTSResultsDF_smnc = YC_resD_smnc['BlindTestResultsDF']
+BTSResultsDF_smnc.sort_values(by=['matthew'], ascending=False, inplace=True)
+