saving work

2022-07-05 16:06:03 +01:00 · 2022-07-05 16:06:03 +01:00 · 79cb89a019
commit 79cb89a019
parent 652cf4802e
2 changed files with 39 additions and 56 deletions
--- a/scripts/ml/ml_functions/MultClfs.py
+++ b/scripts/ml/ml_functions/MultClfs.py
@ -227,7 +227,7 @@ def MultModelsCl(input_df, target, skf_cv
                , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
                , ('SVC'                       , SVC(**rs) ) 
                , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-                , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
+                , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
             ]
    mm_skf_scoresD = {}
--- a/scripts/ml/ml_functions/MultClfs_fi.py
+++ b/scripts/ml/ml_functions/MultClfs_fi.py
@ -74,6 +74,8 @@ from sklearn.impute import KNNImputer as KNN
 import json
 import argparse
 import re
 from sklearn.model_selection import LeaveOneGroupOut
 #%% GLOBALS
 rs = {'random_state': 42}
 njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
@ -95,6 +97,9 @@ rskf_cv = RepeatedStratifiedKFold(n_splits = 10
                                  , n_repeats = 3
                                  , **rs)
 logo = LeaveOneGroupOut()
 mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
 jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
 ###############################################################################
@ -116,7 +121,7 @@ def remove(string):
 # Run Multiple Classifiers
 ############################
 # Multiple Classification - Model Pipeline
-def XGBClf(input_df, target, sel_cv
+def MultClfs_fi(input_df, target, sel_cv
                       , blind_test_df
                       , blind_test_target
                       , tts_split_type 
@ -175,9 +180,37 @@ def XGBClf(input_df, target, sel_cv
    #======================================================
    # Specify multiple Classification Models  
    #======================================================
-    models = [ ('XGBoost' , XGBClassifier(**rs, verbosity = 3, use_label_encoder = False, **njobs) ) 
+    models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
-              , ( 'Random Forest', RandomForestClassifier(**rs, **njobs, n_estimators = 1000))
+               # , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
-              , ('Logistic Regression', LogisticRegression(**rs))]    
+               #  , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
               #  , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
               #  , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
               #  , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
               #  , ('Gaussian NB'               , GaussianNB() )
               #  , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
               #  , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
               #  , ('LDA'                       , LinearDiscriminantAnalysis() )
               #  , ('Logistic Regression'       , LogisticRegression(**rs) )
               #  , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
               #  , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
               #  , ('Multinomial'               , MultinomialNB() )
               #  , ('Naive Bayes'               , BernoulliNB() )
               #  , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
               #  , ('QDA'                       , QuadraticDiscriminantAnalysis() )
               #  , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
               #  # , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
               #  #                                                         , n_estimators     = 1000
               #  #                                                         , bootstrap        = True
               #  #                                                         , oob_score        = True
               #  #                                                         , **njobs
               #  #                                                         , **rs
               #  #                                                         , max_features     = 'auto') ) 
               #   , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
               #   , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
               #   , ('SVC'                       , SVC(**rs) ) 
                , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
                , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
             ]
    mm_skf_scoresD = {}
@ -271,53 +304,3 @@ def XGBClf(input_df, target, sel_cv
    return(mm_skf_scoresD)
 #%%
 sel_cv = skf_cv
 # param dict for getmldata()
 combined_model_paramD = {'data_combined_model'   : False
                    , 'use_or'                   : False
                    , 'omit_all_genomic_features': False
                    , 'write_maskfile'           : False
                    , 'write_outfile'            : False }
 #df = getmldata(gene, drug, **combined_model_paramD)
 df = getmldata('pncA', 'pyrazinamide', **combined_model_paramD)
 df2 = split_tts(df
          , data_type = 'actual'
          , split_type = '80_20'
          , oversampling = False
          , dst_colname = 'dst'
          , target_colname = 'dst_mode'
          , include_gene_name = True
          , random_state = 42 # default
      )
 all(df2['X'].columns.isin(['gene_name']))
 fooD = XGBClf (input_df = df2['X']
                , target = df2['y']
                , sel_cv = skf_cv
                , run_blind_test = True
                , blind_test_df =  df2['X_bts']
                , blind_test_target =  df2['y_bts']
                , tts_split_type  = '80_20'
                , var_type = 'mixed'
                , resampling_type = 'none' # default
 )
 for k, v in fooD.items():
    print('\nK:', k
          , '\nTRAIN MCC:', fooD[k]['test_mcc']
          ,  '\nBTS MCC:' , fooD[k]['bts_mcc'] )
 #%%
 # # fit model on training data
 # model = XGBClassifier()
 # model.fit( df2['X'], df2['y'])
 # # feature importance
 # print(model.feature_importances_)
 # # plot
 # pyplot.bar(range(len(model.feature_importances_)), model.feature_importances_)
 # pyplot.show()