added dummy classifier to models

This commit is contained in:
Tanushree Tunstall 2022-07-27 17:10:04 +01:00
parent c32005c99c
commit 744bc8f4a1
4 changed files with 94 additions and 53 deletions

View file

@@ -78,9 +78,10 @@ import itertools
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.naive_bayes import ComplementNB
+from sklearn.dummy import DummyClassifier
#%% GLOBALS
-#rs = {'random_state': 42}
+#rs = {'random_state': 42} # INSIDE FUNCTION CALL NOW
#njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef)
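
A note on the new import: DummyClassifier supplies a no-skill baseline for the models list below. A minimal standalone sketch (synthetic data, not repo code) of why a most-frequent baseline is worth carrying: it pins MCC at roughly zero, giving every real model a floor to beat.

# Minimal sketch, not repo code: most-frequent baseline scored with MCC,
# mirroring how scoring_fn wires up matthews_corrcoef above.
from sklearn.dummy import DummyClassifier
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples = 200, weights = [0.8], random_state = 42)
baseline = DummyClassifier(strategy = 'most_frequent')
mcc_scores = cross_val_score(baseline, X_demo, y_demo, cv = 5
                             , scoring = make_scorer(matthews_corrcoef))
print(mcc_scores.mean())  # ~0: constant predictions carry no signal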
@@ -261,37 +262,36 @@ def MultModelsCl(input_df, target
#======================================================
models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
, ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
-#, ('Bernoulli NB' , BernoulliNB() ) # pks Naive Bayes, CAUTION
-, ('Complement NB' , ComplementNB() )
-, ('Decision Tree' , DecisionTreeClassifier(**rs) )
-, ('Extra Tree' , ExtraTreeClassifier(**rs) )
-, ('Extra Trees' , ExtraTreesClassifier(**rs) )
-, ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
-, ('Gaussian NB' , GaussianNB() )
-, ('Gaussian Process' , GaussianProcessClassifier(**rs) )
-, ('K-Nearest Neighbors' , KNeighborsClassifier() )
-, ('LDA' , LinearDiscriminantAnalysis() )
-, ('Logistic Regression' , LogisticRegression(**rs) )
-, ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
-, ('MLP' , MLPClassifier(max_iter = 500, **rs) )
-, ('Multinomial NB' , MultinomialNB() )
-, ('Passive Aggressive' , PassiveAggressiveClassifier(**rs, **njobs) )
-, ('QDA' , QuadraticDiscriminantAnalysis() )
-, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
-, ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
-, n_estimators = 1000
-, bootstrap = True
-, oob_score = True
-, **njobs
-, **rs
-, max_features = 'auto') )
-, ('Ridge Classifier' , RidgeClassifier(**rs) )
-, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
-, ('SVC' , SVC(**rs) )
-, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
-, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
+#, ('Bernoulli NB' , BernoulliNB() ) # pks Naive Bayes, CAUTION
+, ('Complement NB' , ComplementNB() )
+, ('Decision Tree' , DecisionTreeClassifier(**rs) )
+, ('Extra Tree' , ExtraTreeClassifier(**rs) )
+, ('Extra Trees' , ExtraTreesClassifier(**rs) )
+, ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
+, ('Gaussian NB' , GaussianNB() )
+, ('Gaussian Process' , GaussianProcessClassifier(**rs) )
+, ('K-Nearest Neighbors' , KNeighborsClassifier() )
+, ('LDA' , LinearDiscriminantAnalysis() )
+, ('Logistic Regression' , LogisticRegression(**rs) )
+, ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
+, ('MLP' , MLPClassifier(max_iter = 500, **rs) )
+, ('Multinomial NB' , MultinomialNB() )
+, ('Passive Aggressive' , PassiveAggressiveClassifier(**rs, **njobs) )
+, ('QDA' , QuadraticDiscriminantAnalysis() )
+, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
+, ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
+, n_estimators = 1000
+, bootstrap = True
+, oob_score = True
+, **njobs
+, **rs
+, max_features = 'auto') )
+, ('Ridge Classifier' , RidgeClassifier(**rs) )
+, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
+, ('SVC' , SVC(**rs) )
+, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
+, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
+, ('Dummy Classifier' , DummyClassifier(strategy = 'most_frequent') )
]
mm_skf_scoresD = {}
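
The body of MultModelsCl that consumes this list is not part of the diff; the following is a hedged sketch of the usual pattern, reusing only the names visible above (models, input_df, target, sel_cv, scoring_fn, mm_skf_scoresD) and assuming a MinMaxScaler preprocessing step, which may differ from the real implementation.

# Hedged sketch of how a (name, estimator) list is typically consumed;
# the real MultModelsCl body is not shown in this commit.
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler  # assumed preprocessing step

for model_name, model_fn in models:
    pipe = Pipeline([('prep', MinMaxScaler())
                     , ('model', model_fn)])
    skf_scores = cross_validate(pipe, input_df, target
                                , cv = sel_cv, scoring = scoring_fn
                                , return_train_score = True)
    # keep the mean of each 'test_*' metric, e.g. the 'test_mcc' printed later
    mm_skf_scoresD[model_name] = {k: round(v.mean(), 2)
                                  for k, v in skf_scores.items()
                                  if k.startswith('test_')}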

View file

@@ -14,10 +14,11 @@ sys.path
# import
from GetMLData import *
from SplitTTS import *
-#from MultClfs import *
-from MultClfs_SIMPLE import *
+from MultClfs import *
+#from MultClfs_SIMPLE import *
#%%
rs = {'random_state': 42}
skf_cv = StratifiedKFold(n_splits = 10
, shuffle = True,**rs)
#sel_cv = logo
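
A quick illustration (toy data, not from the repo) of what the 10-fold stratified CV above guarantees: every fold preserves the class ratio, which matters for imbalanced resistance labels.

import numpy as np
from collections import Counter
from sklearn.model_selection import StratifiedKFold

y_toy = np.array([0] * 90 + [1] * 10)   # 9:1 imbalance
skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)
for _, test_idx in skf.split(np.zeros((100, 1)), y_toy):
    print(Counter(y_toy[test_idx]))     # every fold: Counter({0: 9, 1: 1})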
@@ -28,12 +29,12 @@ skf_cv = StratifiedKFold(n_splits = 10
gene_model_paramD = {'data_combined_model' : False
, 'use_or' : False
, 'omit_all_genomic_features': False
-, 'write_maskfile' : True
+, 'write_maskfile' : False
, 'write_outfile' : False }
#df = getmldata(gene, drug, **gene_model_paramD)
-df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
-#df = getmldata('embB', 'ethambutol' , **gene_model_paramD)
+#df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
+df = getmldata('embB', 'ethambutol' , **gene_model_paramD)
#df = getmldata('katG', 'isoniazid' , **gene_model_paramD)
#df = getmldata('rpoB', 'rifampicin' , **gene_model_paramD)
#df = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
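
The commented-out calls above suggest the script is re-run per gene/drug pair by toggling comments; a small loop over the same five pairs (getmldata and gene_model_paramD as defined above) does the same without edits:

gene_drugD = {'pncA': 'pyrazinamide'
              , 'embB': 'ethambutol'
              , 'katG': 'isoniazid'
              , 'rpoB': 'rifampicin'
              , 'gid' : 'streptomycin'}
for gene, drug in gene_drugD.items():
    df = getmldata(gene, drug, **gene_model_paramD)
    print(gene, drug, len(df))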
@@ -68,9 +69,8 @@ len(df)
Counter(df2['y'])
Counter(df2['y_bts'])
-fooD = MultModelsCl(input_df = df2['X_ros']
-, target = df2['y_ros']
+fooD = MultModelsCl(input_df = df2['X']
+, target = df2['y']
, sel_cv = skf_cv
, run_blind_test = True
, blind_test_df = df2['X_bts']
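
This hunk swaps the resampled inputs (X_ros/y_ros) for the raw split (X/y). The '_ros' suffix presumably denotes random-oversampling output; if so, a minimal imblearn sketch of producing such keys (an assumption, the resampling code is not in this diff):

# Assumption: '_ros' keys come from RandomOverSampler (imblearn).
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state = 42)
X_ros, y_ros = ros.fit_resample(df2['X'], df2['y'])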
@@ -87,7 +87,12 @@ for k, v in fooD.items():
, '\nTRAIN MCC:', fooD[k]['test_mcc']
, '\nBTS MCC:' , fooD[k]['bts_mcc']
, '\nDIFF:',fooD[k]['bts_mcc'] - fooD[k]['test_mcc'] )
+for k, v in fooD.items():
+print('\nModel:', k
+, '\nTRAIN ACCURACY:', fooD[k]['test_accuracy']
+, '\nBTS ACCURACY:' , fooD[k]['bts_accuracy']
+, '\nDIFF:',fooD[k]['bts_accuracy'] - fooD[k]['test_accuracy'] )
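
A possible follow-up (not part of the commit): collect the same numbers into a pandas DataFrame so the train/blind-test gaps can be sorted rather than eyeballed.

import pandas as pd

scores_df = pd.DataFrame(fooD).T   # one row per model
scores_df['mcc_diff'] = scores_df['bts_mcc'] - scores_df['test_mcc']
print(scores_df[['test_mcc', 'bts_mcc', 'mcc_diff']].sort_values('mcc_diff'))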
#%% CHECK SCALING
embb_df = getmldata('embB', 'ethambutol' , **combined_model_paramD)
all(embb_df.columns.isin(['gene_name'])) # should be False
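
Aside: all(embb_df.columns.isin(['gene_name'])) is False for any frame with more than one distinct column name, so the check above can never fail. If the intent is to confirm the identifier column was dropped before modelling, a direct membership test expresses that:

# Clearer equivalent of the apparent intent (assumption): the identifier
# column must not survive as a feature.
assert 'gene_name' not in embb_df.columns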