added cm_logo_skf.py and placeholder for splits

2022-07-01 13:55:12 +01:00 · 2022-07-01 13:55:12 +01:00 · d812835713
commit d812835713
parent 952cfeb4c0
4 changed files with 254 additions and 49 deletions
--- a/scripts/ml/ml_functions/MultClfs_logo_skf.py
+++ b/scripts/ml/ml_functions/MultClfs_logo_skf.py
@ -89,14 +89,7 @@ scoring_fn =  ({ 'mcc'        : make_scorer(matthews_corrcoef)
                , 'jcc'       : make_scorer(jaccard_score)
            }) 
  
-skf_cv = StratifiedKFold(n_splits = 10
-                          #, shuffle = False, random_state= None)
-                          , shuffle = True,**rs)

-rskf_cv = RepeatedStratifiedKFold(n_splits = 10
-                                  , n_repeats = 3
-                                  , **rs)
-logo = LeaveOneGroupOut()

 mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
 jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
@ -160,7 +153,10 @@ def MultModelsCl_logo_skf(input_df
                       , add_yn = True  # adds target var class numbers
                       , var_type = ['numerical', 'categorical','mixed']
                       , run_blind_test = True
-                       , return_formatted_output = True):
+                       , return_formatted_output = True
+                       , random_state = 42
+                       , n_jobs = 10
+                       , ):

    '''
    @ param input_df: input features 
@ -179,10 +175,24 @@ def MultModelsCl_logo_skf(input_df
    Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training
    '''
    
-    # if group == 'none':
-    #     sel_cv = skf_cv
-    # else: 
-    #     group = 'none'
+#%% Func globals        
+    rs = {'random_state': random_state}
+    njobs = {'n_jobs': n_jobs}
+    
+    skf_cv = StratifiedKFold(n_splits = 10
+                              #, shuffle = False, random_state= None)
+                              , shuffle = True,**rs)
+
+    rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+                                      , n_repeats = 3
+                                      , **rs)
+    logo = LeaveOneGroupOut()
+
+    # select CV type:           
+    if group == 'none':
+        sel_cv = skf_cv
+    else: 
+        sel_cv = logo
    #======================================================
    # Determine categorical and numerical features
    #======================================================
@ -210,7 +220,7 @@ def MultModelsCl_logo_skf(input_df
    #======================================================
    # Specify multiple Classification Models  
    #======================================================
-    models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
+    models = [('AdaBoost Classifier'          , AdaBoostClassifier(**rs) )
               , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
               , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
               , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
--- a/scripts/ml/ml_functions/ml_data_combined.py
+++ b/scripts/ml/ml_functions/ml_data_combined.py
@ -63,40 +63,8 @@ else:
          , '\nGot:',  len(common_cols))

 colnames_combined_df = combined_df.columns
+if 'gene_name' in colnames_combined_df:
+    print("\nGene name included")
+else:
+    print('\nGene name NOT included')
 ##############################################################################
-
-#%% split_tts(): func params
-# (ml_input_data
-#   , data_type      = ['actual', 'complete']
-#   , split_type     = ['70_30', '80_20', 'sl']
-#   , oversampling   = True
-#   , dst_colname    = 'dst'# determine how to subset the actual vs reverse data
-#   , target_colname = 'dst_mode'
-#   , include_gene_name = True
-#   , k_smote = 5)
-#%% split data into different data types
-# #===================
-# #     70/30
-# #=================== 
-# # actual
-# tts_7030_paramD = {'data_type'    : 'actual'
-#               , 'split_type'      : '70_30'}
-
-# # complete
-# tts_cd_7030_paramD = {'data_type'  : 'complete'
-#               , 'split_type'      : '70_30'}
-
-# # call split_tts()                   
-# data_CM_7030D = split_tts(ml_input_data = combined_df
-#           , **tts_7030_paramD
-#           , oversampling = True
-#           , dst_colname = 'dst'
-#           , target_colname = 'dst_mode'
-#           , include_gene_name = False) # when not doing leave one group out  
-
-# data_cd_CM_7030D = split_tts(ml_input_data = combined_df
-#           , **tts_cd_7030_paramD
-#           , oversampling = True
-#           , dst_colname = 'dst'
-#           , target_colname = 'dst_mode'
-#           , include_gene_name = False)