Added MultClfs_logo_skf.py and MultClfs.py, made consistent with each other

2022-07-10 12:32:52 +01:00 · 2022-07-10 12:32:52 +01:00 · de5c1270be
commit de5c1270be
parent 06f2ce97b6
5 changed files with 201 additions and 115 deletions
--- a/scripts/ml/combined_model/cm_logo_skf.py
+++ b/scripts/ml/combined_model/cm_logo_skf.py
@ -105,6 +105,7 @@ def CMLogoSkf(cm_input_df
        , target_var = 'dst_mode'
        , gene_group = 'gene_name'
        , std_gene_omit = []
+        , output_dir = outdir
        , file_suffix = ""
        ):

@ -138,7 +139,7 @@ def CMLogoSkf(cm_input_df
        # else:
        #     file_suffix = file_suffix

-        outFile = outdir + str(n_tr_genes+1) + "genes_" + tts_split_type + '_' + file_suffix +  ".csv"
+        outFile = output_dir + str(n_tr_genes+1) + "genes_" + tts_split_type + '_' + file_suffix +  ".csv"
               
        print(outFile)
    
@ -170,7 +171,7 @@ def CMLogoSkf(cm_input_df
        #cm_bts_y = cm_test_df.loc[:, 'dst_mode']
        cm_bts_y = cm_test_df.loc[:, target_var]
    
-        print('\nTEST data dim:', cm_bts_X.shape
+        print('\nTEST data dim:'     , cm_bts_X.shape
              , '\nTEST Target dim:', cm_bts_y.shape)
        
        print("Running Multiple models on LOGO with SKF")
@ -209,4 +210,4 @@ def CMLogoSkf(cm_input_df
 # Actual Data
 #===============
 CMLogoSkf(cm_input_df = combined_df_actual, file_suffix = "actual")
-CMLogoSkf(cm_input_df = combined_df_actual, std_gene_omit=['alr'], file_suffix = "actual")
+# CMLogoSkf(cm_input_df = combined_df_actual, std_gene_omit=['alr'], file_suffix = "actual")
--- a/scripts/ml/ml_functions/MultClfs.py
+++ b/scripts/ml/ml_functions/MultClfs.py
@ -74,10 +74,13 @@ from sklearn.impute import KNNImputer as KNN
 import json
 import argparse
 import re
+import itertools
+from sklearn.model_selection import LeaveOneGroupOut
 from sklearn.decomposition import PCA
+
 #%% GLOBALS
-rs = {'random_state': 42}
-njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
+#rs = {'random_state': 42}
+#njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores

 scoring_fn =  ({ 'mcc'        : make_scorer(matthews_corrcoef)
                , 'fscore'    : make_scorer(f1_score)
@ -88,13 +91,13 @@ scoring_fn =  ({ 'mcc'        : make_scorer(matthews_corrcoef)
                , 'jcc'       : make_scorer(jaccard_score)
            }) 
  
-skf_cv = StratifiedKFold(n_splits = 10
-                          #, shuffle = False, random_state= None)
-                           , shuffle = True,**rs)
+#skf_cv = StratifiedKFold(n_splits = 10
+#                          #, shuffle = False, random_state= None)
+#                           , shuffle = True,**rs)

-rskf_cv = RepeatedStratifiedKFold(n_splits = 10
-                                  , n_repeats = 3
-                                  , **rs)
+#rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+#                                  , n_repeats = 3
+#                                  , **rs)

 mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
 jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
@ -137,6 +140,7 @@ scoreBT_mapD = {'bts_mcc'          : 'MCC'
                , 'bts_jcc'        : 'JCC'
               }

+#gene_group = 'gene_name'
 #%%############################################################################
 ############################
 # MultModelsCl()
@ -145,17 +149,23 @@ scoreBT_mapD = {'bts_mcc'          : 'MCC'
 # Multiple Classification - Model Pipeline
 def MultModelsCl(input_df, target
                       , sel_cv
-                       , blind_test_df
-                       , blind_test_target
                       , tts_split_type
+                       , resampling_type
+                       #, group = None
                       
-                       , resampling_type = 'none' # default
                       , add_cm = True # adds confusion matrix based on cross_val_predict
                       , add_yn = True  # adds target var class numbers
                       , var_type = ['numerical', 'categorical','mixed']
                       , scale_numeric = ['min_max', 'std', 'min_max_neg', 'none'] 
+
                       , run_blind_test = True
-                       , return_formatted_output = True):
+                       , blind_test_df = pd.DataFrame()
+                       , blind_test_target = pd.Series(dtype = int)
+                       , return_formatted_output = True
+
+                       , random_state = 42
+                       , n_jobs = os.cpu_count() # the number of jobs should equal the number of CPU cores
+                       ):

    '''
    @ param input_df: input features 
@ -174,6 +184,24 @@ def MultModelsCl(input_df, target
    Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training
    '''
    
+#%% Func globals        
+    rs = {'random_state': random_state}
+    njobs = {'n_jobs': n_jobs}
+    
+    skf_cv = StratifiedKFold(n_splits = 10
+                              #, shuffle = False, random_state= None)
+                              , shuffle = True,**rs)
+
+    rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+                                      , n_repeats = 3
+                                      , **rs)
+    logo = LeaveOneGroupOut()
+
+    # select CV type:           
+    # if group == None:
+    #     sel_cv = skf_cv
+    # else: 
+    #     sel_cv = logo
    #======================================================
    # Determine categorical and numerical features
    #======================================================
@ -196,8 +224,9 @@ def MultModelsCl(input_df, target
    # #     t = [('num', MinMaxScaler(), numerical_ix)
    # #         , ('cat', OneHotEncoder(), categorical_ix) ]

-    # if var_type == 'mixed':
-    #     t = [('cat', OneHotEncoder(), categorical_ix) ]
+    # col_transform = ColumnTransformer(transformers = t
+    #                                    , remainder='passthrough')
+    
    if type(var_type) == list: 
        var_type = str(var_type[0])
    else:
@ -229,37 +258,37 @@ def MultModelsCl(input_df, target
    #======================================================
    # Specify multiple Classification Models  
    #======================================================
-    models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
-               # , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
-               # , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
-               #  , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
-               #  , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
-               #  , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
-               #  , ('Gaussian NB'               , GaussianNB() )
-               #  , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
-               #  , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
-                 , ('LDA'                       , LinearDiscriminantAnalysis() )
-               # , ('Logistic Regression'       , LogisticRegression(**rs) )
-               # , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
-               # , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
-                #, ('Multinomial'               , MultinomialNB() )
-                # , ('Naive Bayes'               , BernoulliNB() )
-                # , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
-                # , ('QDA'                       , QuadraticDiscriminantAnalysis() )
-                # , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
-                # # , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
-                #                                                         , n_estimators     = 1000
-                #                                                         , bootstrap        = True
-                #                                                         , oob_score        = True
-                #                                                         , **njobs
-                #                                                         , **rs
-                #                                                         , max_features     = 'auto') ) 
-             #     , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
-             #     , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
-             #     , ('SVC'                       , SVC(**rs) ) 
-             #     , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-             #     , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
-             # 
+    models = [('AdaBoost Classifier'         , AdaBoostClassifier(**rs) )
+              , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
+              , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
+              , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
+              , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
+              , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
+              , ('Gaussian NB'               , GaussianNB() )
+              , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
+              , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
+              , ('LDA'                       , LinearDiscriminantAnalysis() )
+              , ('Logistic Regression'       , LogisticRegression(**rs) )
+              , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
+              , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
+              , ('Multinomial'               , MultinomialNB() )
+              , ('Naive Bayes'               , BernoulliNB() )
+              , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
+              , ('QDA'                       , QuadraticDiscriminantAnalysis() )
+              , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
+              , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
+                                                                         , n_estimators     = 1000
+                                                                         , bootstrap        = True
+                                                                         , oob_score        = True
+                                                                         , **njobs
+                                                                         , **rs
+                                                                         , max_features     = 'auto') ) 
+              , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
+              , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
+              , ('SVC'                       , SVC(**rs) ) 
+              , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
+              , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
+              
             ]
                
    mm_skf_scoresD = {}
@ -289,10 +318,11 @@ def MultModelsCl(input_df, target
            
            
        print('\nRunning model pipeline:', model_pipeline)
-        skf_cv_modD = cross_validate(model_pipeline
+        cv_modD = cross_validate(model_pipeline
                              , input_df
                              , target
                              , cv = sel_cv
+                              #, groups = group
                              , scoring = scoring_fn
                              , return_train_score = True)
        #==============================
@ -300,7 +330,7 @@ def MultModelsCl(input_df, target
        #==============================
        mm_skf_scoresD[model_name] = {}
        
-        for key, value in skf_cv_modD.items():
+        for key, value in cv_modD.items():
            print('\nkey:', key, '\nvalue:', value)
            print('\nmean value:', np.mean(value))
            mm_skf_scoresD[model_name][key] = round(np.mean(value),2)
@ -308,7 +338,7 @@ def MultModelsCl(input_df, target
        # ADD more info: meta data related to input df
        mm_skf_scoresD[model_name]['resampling']        = resampling_type
        mm_skf_scoresD[model_name]['n_training_size']   = len(input_df)
-        #mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(Counter(target)[0]/Counter(target)[1], 2)
+        mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(Counter(target)[0]/Counter(target)[1], 2)
        mm_skf_scoresD[model_name]['n_features']        = len(input_df.columns)
        mm_skf_scoresD[model_name]['tts_split']         = tts_split_type
        
@ -321,7 +351,12 @@ def MultModelsCl(input_df, target
           cmD = {}

            # Calculate cm         
-           y_pred   = cross_val_predict(model_pipeline, input_df, target, cv = sel_cv, **njobs)
+           y_pred   = cross_val_predict(model_pipeline
+                                        , input_df
+                                        , target
+                                        , cv = sel_cv
+                                        #, groups = group
+                                        , **njobs)
            #_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally
           tn, fp, fn, tp = confusion_matrix(y_pred, target).ravel()
    
@ -357,7 +392,7 @@ def MultModelsCl(input_df, target
           # Build bts numbers dict
           btD = {'n_blindY_neg'    : Counter(blind_test_target)[0]
                  , 'n_blindY_pos'  : Counter(blind_test_target)[1]
-                  #, 'n_testY_ratio' : round(Counter(blind_test_target)[0]/Counter(blind_test_target)[1], 2)
+                  , 'n_testY_ratio' : round(Counter(blind_test_target)[0]/Counter(blind_test_target)[1], 2)
                  , 'n_test_size'   : len(blind_test_df) }
           
           # Update cmD+tnD dicts with btD
@ -371,9 +406,9 @@ def MultModelsCl(input_df, target
           bts_predict = model_pipeline.predict(blind_test_df)
           
           bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
-           print('\nMCC on Blind test:'     , bts_mcc_score)
+           print('\nMCC on Blind test:'      , bts_mcc_score)
           #print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
-           print('\nMCC on Training:'      , mm_skf_scoresD[model_name]['test_mcc'] )
+           print('\nMCC on Training:'        , mm_skf_scoresD[model_name]['test_mcc'] )

           mm_skf_scoresD[model_name]['bts_mcc']       = bts_mcc_score
           mm_skf_scoresD[model_name]['bts_fscore']    = round(f1_score(blind_test_target, bts_predict),2)
@ -526,7 +561,8 @@ def ProcessMultModelsCl(inputD = {}, blind_test_data = True):
                sys.exit('\nFAIL: Could not merge metadata with CV and BT dfs')

        else:
-            print('\nConcatenting dfs not possible [WF],check numbers ')    
+        #    print('\nConcatenting dfs not possible [WF],check numbers ')    
+            print('\nOnly combining CV and metadata')    

        #-------------------------------------
        # Combine WF+Metadata: Final output
--- a/scripts/ml/ml_functions/MultClfs_logo_skf.py
+++ b/scripts/ml/ml_functions/MultClfs_logo_skf.py
@ -76,7 +76,12 @@ import argparse
 import re
 import itertools
 from sklearn.model_selection import LeaveOneGroupOut
+from sklearn.decomposition import PCA
+
 #%% GLOBALS
+#rs = {'random_state': 42}
+#njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
+
 scoring_fn =  ({ 'mcc'        : make_scorer(matthews_corrcoef)
                , 'fscore'    : make_scorer(f1_score)
                , 'precision' : make_scorer(precision_score)
@ -86,7 +91,13 @@ scoring_fn =  ({ 'mcc'        : make_scorer(matthews_corrcoef)
                , 'jcc'       : make_scorer(jaccard_score)
            }) 
  
+#skf_cv = StratifiedKFold(n_splits = 10
+#                          #, shuffle = False, random_state= None)
+#                           , shuffle = True,**rs)

+#rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+#                                  , n_repeats = 3
+#                                  , **rs)

 mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
 jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
@ -139,21 +150,23 @@ scoreBT_mapD = {'bts_mcc'          : 'MCC'
 def MultModelsCl_logo_skf(input_df
                       , target
                       , sel_cv
+                       , tts_split_type
+                       , resampling_type
+                       #, group = None
                       
-                       , blind_test_df = pd.DataFrame()
-                       , blind_test_target = pd.Series(dtype = int)
-                       , tts_split_type = "none"
-                       #, group = 'none'
-
-                       , resampling_type = 'none' # default
                       , add_cm = True # adds confusion matrix based on cross_val_predict
                       , add_yn = True  # adds target var class numbers
                       , var_type = ['numerical', 'categorical','mixed']
+                       , scale_numeric = ['min_max', 'std', 'min_max_neg', 'none'] 
+
                       , run_blind_test = True
+                       , blind_test_df = pd.DataFrame()
+                       , blind_test_target = pd.Series(dtype = int)
                       , return_formatted_output = True
+
                       , random_state = 42
                       , n_jobs = os.cpu_count() # the number of jobs should equal the number of CPU cores
-                       , ):
+                       ):

    '''
    @ param input_df: input features 
@ -165,7 +178,7 @@ def MultModelsCl_logo_skf(input_df
    @param skv_cv: stratifiedK fold int or object to allow shuffle and random state to pass
    @type: int or StratifiedKfold()
    
-    @var_type: numerical, categorical and mixed to determine what col_transform to apply (MinMaxScalar and/or one-ho    t encoder)
+    @var_type: numerical, categorical and mixed to determine what col_transform to apply (MinMaxScalar and/or one-hot encoder)
    @type: list

    returns
@ -185,8 +198,8 @@ def MultModelsCl_logo_skf(input_df
                                      , **rs)
    logo = LeaveOneGroupOut()

-    # # select CV type:           
-    # if group == 'none':
+    # select CV type:           
+    # if group == None:
    #     sel_cv = skf_cv
    # else: 
    #     sel_cv = logo
@ -201,52 +214,81 @@ def MultModelsCl_logo_skf(input_df
    #======================================================
    # Determine preprocessing steps ~ var_type
    #======================================================
-    if var_type == 'numerical':
-        t = [('num', MinMaxScaler(), numerical_ix)]
+   
+    # if var_type == 'numerical':
+    #     t = [('num', MinMaxScaler(), numerical_ix)]
+
+    # if var_type == 'categorical':
+    #     t = [('cat', OneHotEncoder(), categorical_ix)]
+    
+    # # if var_type == 'mixed':
+    # #     t = [('num', MinMaxScaler(), numerical_ix)
+    # #         , ('cat', OneHotEncoder(), categorical_ix) ]
+
+    # col_transform = ColumnTransformer(transformers = t
+    #                                    , remainder='passthrough')
+    
+    if type(var_type) == list: 
+        var_type = str(var_type[0])
+    else:
+        var_type = var_type
+    
+    if var_type in ['numerical','mixed']:
+        if scale_numeric == ['none']:
+            t = [('cat', OneHotEncoder(), categorical_ix)]
+        if scale_numeric != ['none']:
+            if scale_numeric == ['min_max']:
+                scaler = MinMaxScaler()
+            if scale_numeric == ['min_max_neg']:
+                scaler = MinMaxScaler(feature_range=(-1, 1))
+            if scale_numeric == ['std']:
+                scaler = StandardScaler()
+            
+            t = [('num', scaler, numerical_ix)
+              , ('cat', OneHotEncoder(), categorical_ix)]
+        
                
    if var_type == 'categorical':
        t = [('cat', OneHotEncoder(), categorical_ix)]

-    if var_type == 'mixed':
-        t = [('num', MinMaxScaler(), numerical_ix)
-            , ('cat', OneHotEncoder(), categorical_ix) ]
        
    col_transform = ColumnTransformer(transformers = t
                                       , remainder='passthrough')
    
+    
    #======================================================
    # Specify multiple Classification Models  
    #======================================================
-    models = [('AdaBoost Classifier'          , AdaBoostClassifier(**rs) )
-               , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
-               , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
-               , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
-               , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
-               , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
-               , ('Gaussian NB'               , GaussianNB() )
-               , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
-               , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
-               , ('LDA'                       , LinearDiscriminantAnalysis() )
-               , ('Logistic Regression'       , LogisticRegression(**rs) )
-               , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
-               , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
-               , ('Multinomial'               , MultinomialNB() )
-               , ('Naive Bayes'               , BernoulliNB() )
-               , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
-               , ('QDA'                       , QuadraticDiscriminantAnalysis() )
-               , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000 ) ) 
-               , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
-                                                                       , n_estimators     = 1000
-                                                                       , bootstrap        = True
-                                                                       , oob_score        = True
-                                                                       , **njobs
-                                                                       , **rs
-                                                                       , max_features     = 'auto') ) 
-                , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
-                , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
-                , ('SVC'                       , SVC(**rs) ) 
-                , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-                , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
+    models = [('AdaBoost Classifier'            , AdaBoostClassifier(**rs) )
+               , ('Bagging Classifier'          , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
+               # , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
+               # , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
+               # , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
+               # , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
+               # , ('Gaussian NB'               , GaussianNB() )
+               # , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
+               # , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
+               # , ('LDA'                       , LinearDiscriminantAnalysis() )
+               # , ('Logistic Regression'       , LogisticRegression(**rs) )
+               # , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
+               # , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
+               # , ('Multinomial'               , MultinomialNB() )
+               # , ('Naive Bayes'               , BernoulliNB() )
+               # , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
+               # , ('QDA'                       , QuadraticDiscriminantAnalysis() )
+               # , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
+               # , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
+               #                                                         , n_estimators     = 1000
+               #                                                         , bootstrap        = True
+               #                                                         , oob_score        = True
+               #                                                         , **njobs
+               #                                                         , **rs
+               #                                                         , max_features     = 'auto') ) 
+               #  , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
+               #  , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
+               #  , ('SVC'                       , SVC(**rs) ) 
+               #  , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
+               #  , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
             ]
                
    mm_skf_scoresD = {}
@ -269,6 +311,12 @@ def MultModelsCl_logo_skf(input_df
            ('prep'     , col_transform)
            , ('model'  , model_fn)])
        
+        # model_pipeline = Pipeline([
+        #     ('prep'     , col_transform)
+        #     ,  ('pca'   , PCA(n_components = 2))
+        #     , ('model'  , model_fn)])
+            
+            
        print('\nRunning model pipeline:', model_pipeline)
        cv_modD = cross_validate(model_pipeline
                              , input_df
@ -358,8 +406,9 @@ def MultModelsCl_logo_skf(input_df
           bts_predict = model_pipeline.predict(blind_test_df)
           
           bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
-           print('\nMCC on Blind test:'     , bts_mcc_score)
-           print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
+           print('\nMCC on Blind test:'      , bts_mcc_score)
+           #print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
+           print('\nMCC on Training:'        , mm_skf_scoresD[model_name]['test_mcc'] )

           mm_skf_scoresD[model_name]['bts_mcc']       = bts_mcc_score
           mm_skf_scoresD[model_name]['bts_fscore']    = round(f1_score(blind_test_target, bts_predict),2)
@ -387,8 +436,7 @@ def MultModelsCl_logo_skf(input_df
 ############################
 #Processes the dict from above if use_formatted_output = True 

-def ProcessMultModelsCl(inputD = {}
-                        , blind_test_data = True):
+def ProcessMultModelsCl(inputD = {}, blind_test_data = True):
    
    scoresDF = pd.DataFrame(inputD)
    
--- a/scripts/ml/ml_functions/test_func_singlegene.py
+++ b/scripts/ml/ml_functions/test_func_singlegene.py
@ -26,7 +26,7 @@ skf_cv = StratifiedKFold(n_splits = 10
 #                                   , n_repeats = 3
 #                                   , **rs)
 # param dict for getmldata()
-gene_model_paramD = {'data_combined_model'   : False
+gene_model_paramD = {'data_combined_model'       : False
                    , 'use_or'                   : False
                    , 'omit_all_genomic_features': False
                    , 'write_maskfile'           : False
@ -77,7 +77,7 @@ fooD = MultModelsCl(input_df = df2['X_ros']
                , blind_test_df =  df2['X_bts']
                , blind_test_target =  df2['y_bts']
                , tts_split_type  = spl_type
-                , resampling_type = 'none' # default
+                , resampling_type = 'XXXX' # default
                , var_type = ['mixed']
                , scale_numeric = ['min_max']
                , return_formatted_output = False
--- a/scripts/ml/ml_iterator.py
+++ b/scripts/ml/ml_iterator.py
@ -93,6 +93,7 @@ for gene, drug in ml_gene_drugD.items():
                                    , sel_cv = skf_cv
                                    , blind_test_df =  tempD['X_bts']
                                    , blind_test_target = tempD['y_bts']
+                                    , scale_numeric = ['min_max']
                                    , add_cm = True 
                                    , add_yn = True
                                    , return_formatted_output = True)
@ -103,5 +104,5 @@ for gene, drug in ml_gene_drugD.items():
                out_wf= pd.concat(mmDD, ignore_index = True)
            
            out_wf_f = out_wf.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
-            out_wf_f.to_csv(('/home/tanu/git/Data/ml_combined/genes/'+ out_filename), index = False)
+            out_wf_f.to_csv(out_filename, index = False)