added dummy classifier to models

2022-07-27 17:10:04 +01:00 · 2022-07-27 17:10:04 +01:00 · 744bc8f4a1
commit 744bc8f4a1
parent c32005c99c
4 changed files with 94 additions and 53 deletions
--- a/scripts/ml/dummy_classifier.py
+++ b/scripts/ml/dummy_classifier.py
@ -62,6 +62,7 @@ X.columns

 y = df_clean.iloc[:,171] # dst
 y.value_counts()
+#########################

 y2 = df_clean.iloc[:,172] #dst_mode
 y2.value_counts()
@ -107,3 +108,34 @@ acccuracy:
 (TP+TN)/(TP+TN+FP+FN)

 114/71
+
+######################################
+# try with CV
+
+X_eg = np.array([-1, 1, 1, 1, -2, 9, 4, 4, 1, -1, 3, 0])
+y_eg = np.array([0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1])
+dummy_clf = DummyClassifier(strategy="most_frequent")
+dummy_clf.fit(X_eg, y_eg)
+#DummyClassifier(strategy='most_frequent')
+dummy_clf.predict(X_eg)
+#dummy_clf.predict(np.array([1,1,1,1,1,1,1,1,1,1]))
+#dummy_clf.predict_proba(X_eg)
+
+dummy_clf.score(X_eg, y_eg)
+
+
+cv_DummyD = cross_validate(dummy_clf
+                      , X_eg
+                      , y_eg
+                      , cv = 5
+                      #, groups = group
+                      , scoring = scoring_fn
+                      , return_train_score = True)
+
+cv_dummyD_ALL= {}
+cv_dummyD_ALL['DUMMY'] = {}
+
+for key, value in cv_DummyD.items():
+    print('\nkey:', key, '\nvalue:', value)
+    print('\nmean value:', np.mean(value))
+    cv_dummyD_ALL['DUMMY'][key] = round(np.mean(value),2)
--- a/scripts/ml/ml_functions/MultClfs.py
+++ b/scripts/ml/ml_functions/MultClfs.py
@ -78,9 +78,10 @@ import itertools
 from sklearn.model_selection import LeaveOneGroupOut
 from sklearn.decomposition import PCA
 from sklearn.naive_bayes import ComplementNB
+from sklearn.dummy import DummyClassifier

 #%% GLOBALS
-#rs = {'random_state': 42}
+#rs = {'random_state': 42} # INSIDE FUNCTION CALL NOW
 #njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores

 scoring_fn =  ({ 'mcc'        : make_scorer(matthews_corrcoef)
@ -261,37 +262,36 @@ def MultModelsCl(input_df, target
    #======================================================
    models = [('AdaBoost Classifier'         , AdaBoostClassifier(**rs) )
              , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
-              #, ('Bernoulli NB'               , BernoulliNB() ) # pks Naive Bayes, CAUTION
-              , ('Complement NB'             , ComplementNB() )
-              , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
-              , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
-              , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
-              , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
-              , ('Gaussian NB'               , GaussianNB() )
-              , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
-              , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
-              , ('LDA'                       , LinearDiscriminantAnalysis() )
-              , ('Logistic Regression'       , LogisticRegression(**rs) )
-              , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
-              , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
-              , ('Multinomial NB'               , MultinomialNB() )
-
-              , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
-              , ('QDA'                       , QuadraticDiscriminantAnalysis() )
-              , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
-              , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
-                                                                         , n_estimators     = 1000
-                                                                         , bootstrap        = True
-                                                                         , oob_score        = True
-                                                                         , **njobs
-                                                                         , **rs
-                                                                         , max_features     = 'auto') ) 
-              , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
-              , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
-              , ('SVC'                       , SVC(**rs) ) 
-              , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-              , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
-              
+               #, ('Bernoulli NB'               , BernoulliNB() ) # pks Naive Bayes, CAUTION
+               , ('Complement NB'             , ComplementNB() )
+               , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
+               , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
+               , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
+               , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
+               , ('Gaussian NB'               , GaussianNB() )
+               , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
+               , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
+               , ('LDA'                       , LinearDiscriminantAnalysis() )
+               , ('Logistic Regression'       , LogisticRegression(**rs) )
+               , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
+               , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
+               , ('Multinomial NB'            , MultinomialNB() )
+               , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
+               , ('QDA'                       , QuadraticDiscriminantAnalysis() )
+               , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
+               , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
+                                                                          , n_estimators     = 1000
+                                                                          , bootstrap        = True
+                                                                          , oob_score        = True
+                                                                          , **njobs
+                                                                          , **rs
+                                                                          , max_features     = 'auto') ) 
+               , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
+               , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
+               , ('SVC'                       , SVC(**rs) ) 
+               , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
+               , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
+               , ('Dummy Classifier'          , DummyClassifier(strategy = 'most_frequent') )
             ]
                
    mm_skf_scoresD = {}
--- a/scripts/ml/ml_functions/test_func_singlegene.py
+++ b/scripts/ml/ml_functions/test_func_singlegene.py
@ -14,10 +14,11 @@ sys.path
 # import
 from GetMLData import *
 from SplitTTS import *
-#from MultClfs import *
-from MultClfs_SIMPLE import *
+from MultClfs import *
+#from MultClfs_SIMPLE import *

 #%%
+rs = {'random_state': 42}
 skf_cv = StratifiedKFold(n_splits = 10
                            , shuffle = True,**rs)
 #sel_cv = logo
@ -28,12 +29,12 @@ skf_cv = StratifiedKFold(n_splits = 10
 gene_model_paramD = {'data_combined_model'       : False
                    , 'use_or'                   : False
                    , 'omit_all_genomic_features': False
-                    , 'write_maskfile'           : True
+                    , 'write_maskfile'           : False
                    , 'write_outfile'            : False }

 #df = getmldata(gene, drug, **gene_model_paramD)
-df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
-#df = getmldata('embB', 'ethambutol'   , **gene_model_paramD)
+#df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
+df = getmldata('embB', 'ethambutol'   , **gene_model_paramD)
 #df = getmldata('katG', 'isoniazid'    , **gene_model_paramD)
 #df = getmldata('rpoB', 'rifampicin'   , **gene_model_paramD)
 #df  = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
@ -68,9 +69,8 @@ len(df)
 Counter(df2['y'])
 Counter(df2['y_bts'])

-
-fooD = MultModelsCl(input_df = df2['X_ros']
-                , target = df2['y_ros']
+fooD = MultModelsCl(input_df = df2['X']
+                , target = df2['y']
                , sel_cv = skf_cv
                , run_blind_test = True
                , blind_test_df =  df2['X_bts']
@ -88,6 +88,11 @@ for k, v in fooD.items():
          , '\nBTS MCC:' , fooD[k]['bts_mcc']
          , '\nDIFF:',fooD[k]['bts_mcc'] - fooD[k]['test_mcc'] )

+for k, v in fooD.items():
+    print('\nModel:', k
+          , '\nTRAIN ACCURACY:', fooD[k]['test_accuracy']
+          , '\nBTS ACCURACY:' , fooD[k]['bts_accuracy']
+          , '\nDIFF:',fooD[k]['bts_accuracy'] - fooD[k]['test_accuracy'] )
 #%% CHECK SCALING
 embb_df = getmldata('embB', 'ethambutol'   , **combined_model_paramD)
 all(embb_df.columns.isin(['gene_name'])) # should be False
--- a/scripts/ml/ml_iterator.py
+++ b/scripts/ml/ml_iterator.py
@ -82,22 +82,26 @@ for gene, drug in ml_gene_drugD.items():
                                        , 'target'         : tempD['y']
                                        , 'var_type'       : 'mixed'
                                        , 'resampling_type': 'none'}
-                    , 'smnc_paramD': { 'input_df'          : tempD['X_smnc']
-                                      , 'target'           : tempD['y_smnc']
+                    
+                    , 'smnc_paramD'  : { 'input_df'        : tempD['X_smnc']
+                                       , 'target'          : tempD['y_smnc']
+                                       , 'var_type'        : 'mixed'
+                                       , 'resampling_type' : 'smnc'}
+                    
+                    , 'ros_paramD'   : { 'input_df'        : tempD['X_ros']
+                                      , 'target'           : tempD['y_ros']
                                      , 'var_type'         : 'mixed'
-                                      , 'resampling_type'  : 'smnc'}
-                    , 'ros_paramD': { 'input_df'           : tempD['X_ros']
-                                    , 'target'             : tempD['y_ros']
-                                    , 'var_type'           : 'mixed'
-                                    , 'resampling_type'    : 'ros'}
-                    , 'rus_paramD' : { 'input_df'          : tempD['X_rus']
+                                      , 'resampling_type'  : 'ros'}
+                    
+                    , 'rus_paramD'   : { 'input_df'        : tempD['X_rus']
                                      , 'target'           : tempD['y_rus']
                                      , 'var_type'         : 'mixed'
                                      , 'resampling_type'  : 'rus'}
-                    , 'rouC_paramD' : { 'input_df'         : tempD['X_rouC']
-                                        , 'target'         : tempD['y_rouC']
-                                        , 'var_type'       : 'mixed'
-                                        , 'resampling_type': 'rouC'}
+                    
+                    , 'rouC_paramD'  : { 'input_df'        : tempD['X_rouC']
+                                      , 'target'           : tempD['y_rouC']
+                                      , 'var_type'         : 'mixed'
+                                      , 'resampling_type'  : 'rouC'}
                    }
            
            mmDD = {}