From de5c1270be1a8b5927749f21ea2749ea04f1a174 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Sun, 10 Jul 2022 12:32:52 +0100
Subject: [PATCH] align MultClfs.py and MultClfs_logo_skf.py for consistency

Align the MultModelsCl() signatures in MultClfs.py and
MultClfs_logo_skf.py (tts_split_type and resampling_type are now
required, and random_state/n_jobs are exposed as parameters), and
parameterise the output directory in cm_logo_skf.py.
---
 scripts/ml/combined_model/cm_logo_skf.py     |   7 +-
 scripts/ml/ml_functions/MultClfs.py          | 150 ++++++++++-------
 scripts/ml/ml_functions/MultClfs_logo_skf.py | 152 ++++++++++++------
 .../ml/ml_functions/test_func_singlegene.py  |   4 +-
 scripts/ml/ml_iterator.py                    |   3 +-
 5 files changed, 201 insertions(+), 115 deletions(-)

diff --git a/scripts/ml/combined_model/cm_logo_skf.py b/scripts/ml/combined_model/cm_logo_skf.py
index 4efa0f3..0ad72a2 100755
--- a/scripts/ml/combined_model/cm_logo_skf.py
+++ b/scripts/ml/combined_model/cm_logo_skf.py
@@ -105,6 +105,7 @@ def CMLogoSkf(cm_input_df
               , target_var = 'dst_mode'
               , gene_group = 'gene_name'
               , std_gene_omit = []
+              , output_dir = outdir
               , file_suffix = ""
               ):
 
@@ -138,7 +139,7 @@ def CMLogoSkf(cm_input_df
     # else:
     #     file_suffix = file_suffix
 
-    outFile = outdir + str(n_tr_genes+1) + "genes_" + tts_split_type + '_' + file_suffix + ".csv"
+    outFile = output_dir + str(n_tr_genes+1) + "genes_" + tts_split_type + '_' + file_suffix + ".csv"
 
     print(outFile)
 
@@ -170,7 +171,7 @@ def CMLogoSkf(cm_input_df
     #cm_bts_y = cm_test_df.loc[:, 'dst_mode']
     cm_bts_y = cm_test_df.loc[:, target_var]
 
-    print('\nTEST data dim:', cm_bts_X.shape
+    print('\nTEST data dim:' , cm_bts_X.shape
           , '\nTEST Target dim:', cm_bts_y.shape)
 
     print("Running Multiple models on LOGO with SKF")
@@ -209,4 +210,4 @@ def CMLogoSkf(cm_input_df
 # Actual Data
 #===============
 CMLogoSkf(cm_input_df = combined_df_actual, file_suffix = "actual")
-CMLogoSkf(cm_input_df = combined_df_actual, std_gene_omit=['alr'], file_suffix = "actual")
+# CMLogoSkf(cm_input_df = combined_df_actual, std_gene_omit=['alr'], file_suffix = "actual")
diff --git a/scripts/ml/ml_functions/MultClfs.py b/scripts/ml/ml_functions/MultClfs.py
index 1f46df9..3e6c729 100755
--- a/scripts/ml/ml_functions/MultClfs.py
+++ b/scripts/ml/ml_functions/MultClfs.py
@@ -74,10 +74,13 @@ from sklearn.impute import KNNImputer as KNN
 import json
 import argparse
 import re
+import itertools
+from sklearn.model_selection import LeaveOneGroupOut
 from sklearn.decomposition import PCA
+
 #%% GLOBALS
-rs = {'random_state': 42}
-njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
+#rs = {'random_state': 42}
+#njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
 
 scoring_fn = ({ 'mcc'        : make_scorer(matthews_corrcoef)
               , 'fscore'     : make_scorer(f1_score)
@@ -88,13 +91,13 @@ scoring_fn = ({ 'mcc'        : make_scorer(matthews_corrcoef)
               , 'jcc'        : make_scorer(jaccard_score)
               })
 
-skf_cv = StratifiedKFold(n_splits = 10
-                         #, shuffle = False, random_state= None)
-                         , shuffle = True,**rs)
+#skf_cv = StratifiedKFold(n_splits = 10
+#                         #, shuffle = False, random_state= None)
+#                         , shuffle = True,**rs)
 
-rskf_cv = RepeatedStratifiedKFold(n_splits = 10
-                                  , n_repeats = 3
-                                  , **rs)
+#rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+#                                  , n_repeats = 3
+#                                  , **rs)
 
 mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
 jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
@@ -137,6 +140,7 @@ scoreBT_mapD = {'bts_mcc'   : 'MCC'
                 , 'bts_jcc' : 'JCC'
                 }
 
+#gene_group = 'gene_name'
 #%%############################################################################
 ############################
 # MultModelsCl()
@@ -145,17 +149,23 @@
 ############################
 # Multiple Classification - Model Pipeline
 def MultModelsCl(input_df, target
                  , sel_cv
-                 , blind_test_df
-                 , blind_test_target
-                 , tts_split_type
-
-                 , resampling_type = 'none' # default
+                 , tts_split_type
+                 , resampling_type
+                 #, group = None
+
                  , add_cm = True # adds confusion matrix based on cross_val_predict
                  , add_yn = True # adds target var class numbers
-                 , var_type = ['numerical', 'categorical','mixed']
+                 , var_type = ['numerical', 'categorical','mixed']
                  , scale_numeric = ['min_max', 'std', 'min_max_neg', 'none']
+                 , run_blind_test = True
-                 , return_formatted_output = True):
+                 , blind_test_df = pd.DataFrame()
+                 , blind_test_target = pd.Series(dtype = int)
+                 , return_formatted_output = True
+
+                 , random_state = 42
+                 , n_jobs = os.cpu_count() # the number of jobs should equal the number of CPU cores
+                 ):
 
     '''
     @ param input_df: input features
@@ -173,7 +183,25 @@ def MultModelsCl(input_df, target
     returns
           Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training
     '''
+
+#%% Func globals
+    rs = {'random_state': random_state}
+    njobs = {'n_jobs': n_jobs}
+
+    skf_cv = StratifiedKFold(n_splits = 10
+                             #, shuffle = False, random_state= None)
+                             , shuffle = True,**rs)
 
+    rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+                                      , n_repeats = 3
+                                      , **rs)
+    logo = LeaveOneGroupOut()
+
+    # select CV type:
+    # if group == None:
+    #     sel_cv = skf_cv
+    # else:
+    #     sel_cv = logo
     #======================================================
     # Determine categorical and numerical features
     #======================================================
@@ -196,8 +224,9 @@ def MultModelsCl(input_df, target
     # # t = [('num', MinMaxScaler(), numerical_ix)
     # #      , ('cat', OneHotEncoder(), categorical_ix) ]
 
-    # if var_type == 'mixed':
-    #     t = [('cat', OneHotEncoder(), categorical_ix) ]
+    # col_transform = ColumnTransformer(transformers = t
+    #                                   , remainder='passthrough')
+
     if type(var_type) == list:
         var_type = str(var_type[0])
     else:
         var_type = var_type
@@ -229,37 +258,37 @@ def MultModelsCl(input_df, target
     #======================================================
     # Specify multiple Classification Models
     #======================================================
-    models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
-              # , ('Bagging Classifier'  , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
-              # , ('Decision Tree'       , DecisionTreeClassifier(**rs) )
-              # , ('Extra Tree'          , ExtraTreeClassifier(**rs) )
-              # , ('Extra Trees'         , ExtraTreesClassifier(**rs) )
-              # , ('Gradient Boosting'   , GradientBoostingClassifier(**rs) )
-              # , ('Gaussian NB'         , GaussianNB() )
-              # , ('Gaussian Process'    , GaussianProcessClassifier(**rs) )
-              # , ('K-Nearest Neighbors' , KNeighborsClassifier() )
-              , ('LDA'                   , LinearDiscriminantAnalysis() )
-              # , ('Logistic Regression' , LogisticRegression(**rs) )
-              # , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
-              # , ('MLP'                 , MLPClassifier(max_iter = 500, **rs) )
-              #, ('Multinomial'          , MultinomialNB() )
-              # , ('Naive Bayes'         , BernoulliNB() )
-              # , ('Passive Aggresive'   , PassiveAggressiveClassifier(**rs, **njobs) )
-              # , ('QDA'                 , QuadraticDiscriminantAnalysis() )
-              # , ('Random Forest'       , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
-              # # , ('Random Forest2'    , RandomForestClassifier(min_samples_leaf = 5
-              #                                                   , n_estimators = 1000
-              #                                                   , bootstrap = True
-              #                                                   , oob_score = True
-              #                                                   , **njobs
-              #                                                   , **rs
-              #                                                   , max_features = 'auto') )
-              # , ('Ridge Classifier'    , RidgeClassifier(**rs) )
-              # , ('Ridge ClassifierCV'  , RidgeClassifierCV(cv = 3) )
-              # , ('SVC'                 , SVC(**rs) )
-              # , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
-              # , ('XGBoost'             , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
-              #
+    models = [('AdaBoost Classifier'    , AdaBoostClassifier(**rs) )
+              , ('Bagging Classifier'   , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
+              , ('Decision Tree'        , DecisionTreeClassifier(**rs) )
+              , ('Extra Tree'           , ExtraTreeClassifier(**rs) )
+              , ('Extra Trees'          , ExtraTreesClassifier(**rs) )
+              , ('Gradient Boosting'    , GradientBoostingClassifier(**rs) )
+              , ('Gaussian NB'          , GaussianNB() )
+              , ('Gaussian Process'     , GaussianProcessClassifier(**rs) )
+              , ('K-Nearest Neighbors'  , KNeighborsClassifier() )
+              , ('LDA'                  , LinearDiscriminantAnalysis() )
+              , ('Logistic Regression'  , LogisticRegression(**rs) )
+              , ('Logistic RegressionCV', LogisticRegressionCV(cv = 3, **rs))
+              , ('MLP'                  , MLPClassifier(max_iter = 500, **rs) )
+              , ('Multinomial'          , MultinomialNB() )
+              , ('Naive Bayes'          , BernoulliNB() )
+              , ('Passive Aggressive'   , PassiveAggressiveClassifier(**rs, **njobs) )
+              , ('QDA'                  , QuadraticDiscriminantAnalysis() )
+              , ('Random Forest'        , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
+              , ('Random Forest2'       , RandomForestClassifier(min_samples_leaf = 5
+                                                                 , n_estimators = 1000
+                                                                 , bootstrap = True
+                                                                 , oob_score = True
+                                                                 , **njobs
+                                                                 , **rs
+                                                                 , max_features = 'auto') )
+              , ('Ridge Classifier'     , RidgeClassifier(**rs) )
+              , ('Ridge ClassifierCV'   , RidgeClassifierCV(cv = 3) )
+              , ('SVC'                  , SVC(**rs) )
+              , ('Stochastic GDescent'  , SGDClassifier(**rs, **njobs) )
+              , ('XGBoost'              , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
+
               ]
 
     mm_skf_scoresD = {}
@@ -289,10 +318,11 @@ def MultModelsCl(input_df, target
 
         print('\nRunning model pipeline:', model_pipeline)
 
-        skf_cv_modD = cross_validate(model_pipeline
+        cv_modD = cross_validate(model_pipeline
                                      , input_df
                                      , target
                                      , cv = sel_cv
+                                     #, groups = group
                                      , scoring = scoring_fn
                                      , return_train_score = True)
         #==============================
@@ -300,7 +330,7 @@ def MultModelsCl(input_df, target
         #==============================
         mm_skf_scoresD[model_name] = {}
 
-        for key, value in skf_cv_modD.items():
+        for key, value in cv_modD.items():
             print('\nkey:', key, '\nvalue:', value)
             print('\nmean value:', np.mean(value))
             mm_skf_scoresD[model_name][key] = round(np.mean(value),2)
@@ -308,7 +338,7 @@ def MultModelsCl(input_df, target
         # ADD more info: meta data related to input df
         mm_skf_scoresD[model_name]['resampling']      = resampling_type
         mm_skf_scoresD[model_name]['n_training_size'] = len(input_df)
-        #mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(Counter(target)[0]/Counter(target)[1], 2)
+        mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(Counter(target)[0]/Counter(target)[1], 2)
         mm_skf_scoresD[model_name]['n_features']      = len(input_df.columns)
         mm_skf_scoresD[model_name]['tts_split']       = tts_split_type
 
@@ -321,7 +351,12 @@ def MultModelsCl(input_df, target
             cmD = {}
 
             # Calculate cm
-            y_pred = cross_val_predict(model_pipeline, input_df, target, cv = sel_cv, **njobs)
+            y_pred = cross_val_predict(model_pipeline
+                                       , input_df
+                                       , target
+                                       , cv = sel_cv
+                                       #, groups = group
+                                       , **njobs)
             #_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally
             tn, fp, fn, tp = confusion_matrix(y_pred, target).ravel()
@@ -357,7 +392,7 @@ def MultModelsCl(input_df, target
             # Build bts numbers dict
             btD = {'n_blindY_neg'    : Counter(blind_test_target)[0]
                    , 'n_blindY_pos'  : Counter(blind_test_target)[1]
-                   #, 'n_testY_ratio' : round(Counter(blind_test_target)[0]/Counter(blind_test_target)[1], 2)
+                   , 'n_testY_ratio'  : round(Counter(blind_test_target)[0]/Counter(blind_test_target)[1], 2)
                    , 'n_test_size'    : len(blind_test_df) }
 
             # Update cmD+tnD dicts with btD
@@ -371,9 +406,9 @@ def MultModelsCl(input_df, target
             bts_predict = model_pipeline.predict(blind_test_df)
 
             bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
-            print('\nMCC on Blind test:' , bts_mcc_score)
+            print('\nMCC on Blind test:'  , bts_mcc_score)
             #print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
-            print('\nMCC on Training:' , mm_skf_scoresD[model_name]['test_mcc'] )
+            print('\nMCC on Training:'    , mm_skf_scoresD[model_name]['test_mcc'] )
 
             mm_skf_scoresD[model_name]['bts_mcc']    = bts_mcc_score
             mm_skf_scoresD[model_name]['bts_fscore'] = round(f1_score(blind_test_target, bts_predict),2)
@@ -384,7 +419,7 @@ def MultModelsCl(input_df, target
             mm_skf_scoresD[model_name]['bts_jcc']    = round(jaccard_score(blind_test_target, bts_predict),2)
             #mm_skf_scoresD[model_name]['diff_mcc'] = train_test_diff_MCC
-
+    #return(mm_skf_scoresD)
     #============================
     # Process the dict to have WF
@@ -526,7 +561,8 @@ def ProcessMultModelsCl(inputD = {}, blind_test_data = True):
             sys.exit('\nFAIL: Could not merge metadata with CV and BT dfs')
 
     else:
-        print('\nConcatenting dfs not possible [WF],check numbers ')
+        # print('\nConcatenting dfs not possible [WF],check numbers ')
+        print('\nOnly combining CV and metadata')
 
     #-------------------------------------
     # Combine WF+Metadata: Final output
diff --git a/scripts/ml/ml_functions/MultClfs_logo_skf.py b/scripts/ml/ml_functions/MultClfs_logo_skf.py
index 68eb906..1b4c2ff 100755
--- a/scripts/ml/ml_functions/MultClfs_logo_skf.py
+++ b/scripts/ml/ml_functions/MultClfs_logo_skf.py
@@ -76,7 +76,12 @@ import argparse
 import re
 import itertools
 from sklearn.model_selection import LeaveOneGroupOut
+from sklearn.decomposition import PCA
+
 #%% GLOBALS
+#rs = {'random_state': 42}
+#njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
+
 scoring_fn = ({ 'mcc'        : make_scorer(matthews_corrcoef)
               , 'fscore'     : make_scorer(f1_score)
               , 'precision'  : make_scorer(precision_score)
@@ -86,7 +91,13 @@ scoring_fn = ({ 'mcc'        : make_scorer(matthews_corrcoef)
               , 'jcc'        : make_scorer(jaccard_score)
               })
 
+#skf_cv = StratifiedKFold(n_splits = 10
+#                         #, shuffle = False, random_state= None)
+#                         , shuffle = True,**rs)
 
+#rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+#                                  , n_repeats = 3
+#                                  , **rs)
 mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
 jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
 
@@ -139,21 +150,23 @@
 def MultModelsCl_logo_skf(input_df
                           , target
                           , sel_cv
-
-                          , blind_test_df = pd.DataFrame()
-                          , blind_test_target = pd.Series(dtype = int)
-                          , tts_split_type = "none"
-                          #, group = 'none'
-
-                          , resampling_type = 'none' # default
+                          , tts_split_type
+                          , resampling_type
+                          #, group = None
+
                           , add_cm = True # adds confusion matrix based on cross_val_predict
                           , add_yn = True # adds target var class numbers
                           , var_type = ['numerical', 'categorical','mixed']
+                          , scale_numeric = ['min_max', 'std', 'min_max_neg', 'none']
+                          , run_blind_test = True
+                          , blind_test_df = pd.DataFrame()
+                          , blind_test_target = pd.Series(dtype = int)
                           , return_formatted_output = True
+                          , random_state = 42
                           , n_jobs = os.cpu_count() # the number of jobs should equal the number of CPU cores
-                          , ):
+                          ):
 
     '''
     @ param input_df: input features
@@ -165,7 +178,7 @@ def MultModelsCl_logo_skf(input_df
     @param skv_cv: stratifiedK fold int or object to allow shuffle and random state to pass
     @type: int or StratifiedKfold()
 
-    @var_type: numerical, categorical and mixed to determine what col_transform to apply (MinMaxScalar and/or one-ho t encoder)
+    @var_type: numerical, categorical and mixed to determine what col_transform to apply (MinMaxScaler and/or one-hot encoder)
     @type: list
 
     returns
@@ -185,8 +198,8 @@ def MultModelsCl_logo_skf(input_df
                                       , **rs)
     logo = LeaveOneGroupOut()
 
-    # # select CV type:
-    # if group == 'none':
+    # select CV type:
+    # if group == None:
     #     sel_cv = skf_cv
     # else:
     #     sel_cv = logo
@@ -201,52 +214,81 @@ def MultModelsCl_logo_skf(input_df
     #======================================================
     # Determine preprocessing steps ~ var_type
    #======================================================
-    if var_type == 'numerical':
-        t = [('num', MinMaxScaler(), numerical_ix)]
+
+    # if var_type == 'numerical':
+    #     t = [('num', MinMaxScaler(), numerical_ix)]
+    # if var_type == 'categorical':
+    #     t = [('cat', OneHotEncoder(), categorical_ix)]
+
+    # # if var_type == 'mixed':
+    # #     t = [('num', MinMaxScaler(), numerical_ix)
+    # #          , ('cat', OneHotEncoder(), categorical_ix) ]
+
+    # col_transform = ColumnTransformer(transformers = t
+    #                                   , remainder='passthrough')
+
+    if type(var_type) == list:
+        var_type = str(var_type[0])
+    else:
+        var_type = var_type
+
+    if var_type in ['numerical','mixed']:
+        if scale_numeric == ['none']:
+            t = [('cat', OneHotEncoder(), categorical_ix)]
+        if scale_numeric != ['none']:
+            if scale_numeric == ['min_max']:
+                scaler = MinMaxScaler()
+            if scale_numeric == ['min_max_neg']:
+                scaler = MinMaxScaler(feature_range=(-1, 1))
+            if scale_numeric == ['std']:
+                scaler = StandardScaler()
+
+            t = [('num', scaler, numerical_ix)
+                 , ('cat', OneHotEncoder(), categorical_ix)]
 
     if var_type == 'categorical':
         t = [('cat', OneHotEncoder(), categorical_ix)]
-
-    if var_type == 'mixed':
-        t = [('num', MinMaxScaler(), numerical_ix)
-             , ('cat', OneHotEncoder(), categorical_ix) ]
 
     col_transform = ColumnTransformer(transformers = t
                                       , remainder='passthrough')
+
     #======================================================
     # Specify multiple Classification Models
     #======================================================
-    models = [('AdaBoost Classifier'    , AdaBoostClassifier(**rs) )
-              , ('Bagging Classifier'   , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
-              , ('Decision Tree'        , DecisionTreeClassifier(**rs) )
-              , ('Extra Tree'           , ExtraTreeClassifier(**rs) )
-              , ('Extra Trees'          , ExtraTreesClassifier(**rs) )
-              , ('Gradient Boosting'    , GradientBoostingClassifier(**rs) )
-              , ('Gaussian NB'          , GaussianNB() )
-              , ('Gaussian Process'     , GaussianProcessClassifier(**rs) )
-              , ('K-Nearest Neighbors'  , KNeighborsClassifier() )
-              , ('LDA'                  , LinearDiscriminantAnalysis() )
-              , ('Logistic Regression'  , LogisticRegression(**rs) )
-              , ('Logistic RegressionCV', LogisticRegressionCV(cv = 3, **rs))
-              , ('MLP'                  , MLPClassifier(max_iter = 500, **rs) )
-              , ('Multinomial'          , MultinomialNB() )
-              , ('Naive Bayes'          , BernoulliNB() )
-              , ('Passive Aggresive'    , PassiveAggressiveClassifier(**rs, **njobs) )
-              , ('QDA'                  , QuadraticDiscriminantAnalysis() )
-              , ('Random Forest'        , RandomForestClassifier(**rs, n_estimators = 1000 ) )
-              , ('Random Forest2'       , RandomForestClassifier(min_samples_leaf = 5
-                                                                 , n_estimators = 1000
-                                                                 , bootstrap = True
-                                                                 , oob_score = True
-                                                                 , **njobs
-                                                                 , **rs
-                                                                 , max_features = 'auto') )
-              , ('Ridge Classifier'     , RidgeClassifier(**rs) )
-              , ('Ridge ClassifierCV'   , RidgeClassifierCV(cv = 3) )
-              , ('SVC'                  , SVC(**rs) )
-              , ('Stochastic GDescent'  , SGDClassifier(**rs, **njobs) )
-              , ('XGBoost'              , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
+    models = [('AdaBoost Classifier'    , AdaBoostClassifier(**rs) )
+              , ('Bagging Classifier'   , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
+              # , ('Decision Tree'      , DecisionTreeClassifier(**rs) )
+              # , ('Extra Tree'         , ExtraTreeClassifier(**rs) )
+              # , ('Extra Trees'        , ExtraTreesClassifier(**rs) )
+              # , ('Gradient Boosting'  , GradientBoostingClassifier(**rs) )
+              # , ('Gaussian NB'        , GaussianNB() )
+              # , ('Gaussian Process'   , GaussianProcessClassifier(**rs) )
+              # , ('K-Nearest Neighbors', KNeighborsClassifier() )
+              # , ('LDA'                , LinearDiscriminantAnalysis() )
+              # , ('Logistic Regression', LogisticRegression(**rs) )
+              # , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
+              # , ('MLP'                , MLPClassifier(max_iter = 500, **rs) )
+              # , ('Multinomial'        , MultinomialNB() )
+              # , ('Naive Bayes'        , BernoulliNB() )
+              # , ('Passive Aggressive' , PassiveAggressiveClassifier(**rs, **njobs) )
+              # , ('QDA'                , QuadraticDiscriminantAnalysis() )
+              # , ('Random Forest'      , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
+              # , ('Random Forest2'     , RandomForestClassifier(min_samples_leaf = 5
+              #                                                  , n_estimators = 1000
+              #                                                  , bootstrap = True
+              #                                                  , oob_score = True
+              #                                                  , **njobs
+              #                                                  , **rs
+              #                                                  , max_features = 'auto') )
+              # , ('Ridge Classifier'   , RidgeClassifier(**rs) )
+              # , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
+              # , ('SVC'                , SVC(**rs) )
+              # , ('Stochastic GDescent', SGDClassifier(**rs, **njobs) )
+              # , ('XGBoost'            , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
               ]
 
     mm_skf_scoresD = {}
@@ -268,6 +310,12 @@ def MultModelsCl_logo_skf(input_df
         model_pipeline = Pipeline([
             ('prep'    , col_transform)
             , ('model' , model_fn)])
+
+        # model_pipeline = Pipeline([
+        #     ('prep'    , col_transform)
+        #     , ('pca'   , PCA(n_components = 2))
+        #     , ('model' , model_fn)])
+
         print('\nRunning model pipeline:', model_pipeline)
 
         cv_modD = cross_validate(model_pipeline
@@ -358,9 +406,10 @@ def MultModelsCl_logo_skf(input_df
             bts_predict = model_pipeline.predict(blind_test_df)
 
             bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
-            print('\nMCC on Blind test:' , bts_mcc_score)
-            print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
-
+            print('\nMCC on Blind test:'  , bts_mcc_score)
+            #print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
+            print('\nMCC on Training:'    , mm_skf_scoresD[model_name]['test_mcc'] )
+
             mm_skf_scoresD[model_name]['bts_mcc']       = bts_mcc_score
             mm_skf_scoresD[model_name]['bts_fscore']    = round(f1_score(blind_test_target, bts_predict),2)
             mm_skf_scoresD[model_name]['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2)
@@ -387,8 +436,7 @@ def MultModelsCl_logo_skf(input_df
 ############################
 #Processes the dict from above if use_formatted_output = True
-def ProcessMultModelsCl(inputD = {}
-                        , blind_test_data = True):
+def ProcessMultModelsCl(inputD = {}, blind_test_data = True):
 
     scoresDF = pd.DataFrame(inputD)
diff --git a/scripts/ml/ml_functions/test_func_singlegene.py b/scripts/ml/ml_functions/test_func_singlegene.py
index 6abccb4..729fafe 100644
--- a/scripts/ml/ml_functions/test_func_singlegene.py
+++ b/scripts/ml/ml_functions/test_func_singlegene.py
@@ -26,7 +26,7 @@ skf_cv = StratifiedKFold(n_splits = 10
 #                         , n_repeats = 3
 #                         , **rs)
 # param dict for getmldata()
-gene_model_paramD = {'data_combined_model' : False
+gene_model_paramD = {'data_combined_model'        : False
                      , 'use_or'                   : False
                      , 'omit_all_genomic_features': False
                      , 'write_maskfile'           : False
@@ -77,7 +77,7 @@ fooD = MultModelsCl(input_df = df2['X_ros']
                     , blind_test_df = df2['X_bts']
                     , blind_test_target = df2['y_bts']
                     , tts_split_type = spl_type
-                    , resampling_type = 'none' # default
+                    , resampling_type = 'XXXX' # placeholder
                     , var_type = ['mixed']
                     , scale_numeric = ['min_max']
                     , return_formatted_output = False
diff --git a/scripts/ml/ml_iterator.py b/scripts/ml/ml_iterator.py
index 7f0aafb..8ebb88c 100755
--- a/scripts/ml/ml_iterator.py
+++ b/scripts/ml/ml_iterator.py
@@ -93,6 +93,7 @@ for gene, drug in ml_gene_drugD.items():
                            , sel_cv = skf_cv
                            , blind_test_df = tempD['X_bts']
                            , blind_test_target = tempD['y_bts']
+                           , scale_numeric = ['min_max']
                            , add_cm = True
                            , add_yn = True
                            , return_formatted_output = True)
@@ -103,5 +104,5 @@ for gene, drug in ml_gene_drugD.items():
 
 out_wf= pd.concat(mmDD, ignore_index = True)
 
 out_wf_f = out_wf.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
-out_wf_f.to_csv(('/home/tanu/git/Data/ml_combined/genes/'+ out_filename), index = False)
+out_wf_f.to_csv(out_filename, index = False)
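
Usage sketch (not part of the patch): a minimal, hypothetical call against the
aligned MultModelsCl() signature, with synthetic data standing in for the real
gene feature tables. It assumes scripts/ml/ml_functions is on the import path;
the column names and the '70_30' split label below are illustrative only:

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import StratifiedKFold

    from MultClfs import MultModelsCl  # assumed import path

    rng = np.random.default_rng(42)

    # synthetic stand-ins for the training and blind-test feature tables
    X     = pd.DataFrame({'feat1': rng.normal(size = 100), 'feat2': rng.normal(size = 100)})
    y     = pd.Series(rng.integers(0, 2, size = 100))
    X_bts = pd.DataFrame({'feat1': rng.normal(size = 40), 'feat2': rng.normal(size = 40)})
    y_bts = pd.Series(rng.integers(0, 2, size = 40))

    skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)

    # tts_split_type and resampling_type are now required, and the blind-test
    # arguments sit behind run_blind_test; note this runs every enabled model,
    # so expect it to take a while.
    scoresD = MultModelsCl(input_df = X
                           , target = y
                           , sel_cv = skf_cv
                           , tts_split_type = '70_30'   # illustrative label
                           , resampling_type = 'none'
                           , var_type = ['numerical']
                           , scale_numeric = ['min_max']
                           , run_blind_test = True
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts
                           , return_formatted_output = False)  # raw per-model dict

With return_formatted_output = True the result would instead be passed through
ProcessMultModelsCl(), as ml_iterator.py does.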