added dummy classifier to models

This commit is contained in:
Tanushree Tunstall 2022-07-27 17:10:04 +01:00
parent c32005c99c
commit 744bc8f4a1
4 changed files with 94 additions and 53 deletions

View file

@@ -78,9 +78,10 @@ import itertools
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
from sklearn.naive_bayes import ComplementNB
+from sklearn.dummy import DummyClassifier
#%% GLOBALS
-#rs = {'random_state': 42}
+#rs = {'random_state': 42} # INSIDE FUNCTION CALL NOW
#njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef)
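
A note on the new import: DummyClassifier supplies a no-skill baseline for the models list below. A minimal standalone sketch (synthetic data, not repo code) of why a most-frequent baseline is worth carrying: it pins MCC at roughly zero, giving every real model a floor to beat.

# Minimal sketch, not repo code: most-frequent baseline scored with MCC,
# mirroring how scoring_fn wires up matthews_corrcoef above.
from sklearn.dummy import DummyClassifier
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples = 200, weights = [0.8], random_state = 42)
baseline = DummyClassifier(strategy = 'most_frequent')
mcc_scores = cross_val_score(baseline, X_demo, y_demo, cv = 5
                             , scoring = make_scorer(matthews_corrcoef))
print(mcc_scores.mean())  # ~0: constant predictions carry no signal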
@@ -261,37 +262,36 @@ def MultModelsCl(input_df, target
#======================================================
models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
, ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
-#, ('Bernoulli NB' , BernoulliNB() ) # pks Naive Bayes, CAUTION
-, ('Complement NB' , ComplementNB() )
-, ('Decision Tree' , DecisionTreeClassifier(**rs) )
-, ('Extra Tree' , ExtraTreeClassifier(**rs) )
-, ('Extra Trees' , ExtraTreesClassifier(**rs) )
-, ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
-, ('Gaussian NB' , GaussianNB() )
-, ('Gaussian Process' , GaussianProcessClassifier(**rs) )
-, ('K-Nearest Neighbors' , KNeighborsClassifier() )
-, ('LDA' , LinearDiscriminantAnalysis() )
-, ('Logistic Regression' , LogisticRegression(**rs) )
-, ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
-, ('MLP' , MLPClassifier(max_iter = 500, **rs) )
-, ('Multinomial NB' , MultinomialNB() )
-, ('Passive Aggressive' , PassiveAggressiveClassifier(**rs, **njobs) )
-, ('QDA' , QuadraticDiscriminantAnalysis() )
-, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
-, ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
-, n_estimators = 1000
-, bootstrap = True
-, oob_score = True
-, **njobs
-, **rs
-, max_features = 'auto') )
-, ('Ridge Classifier' , RidgeClassifier(**rs) )
-, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
-, ('SVC' , SVC(**rs) )
-, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
-, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
+#, ('Bernoulli NB' , BernoulliNB() ) # pks Naive Bayes, CAUTION
+, ('Complement NB' , ComplementNB() )
+, ('Decision Tree' , DecisionTreeClassifier(**rs) )
+, ('Extra Tree' , ExtraTreeClassifier(**rs) )
+, ('Extra Trees' , ExtraTreesClassifier(**rs) )
+, ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
+, ('Gaussian NB' , GaussianNB() )
+, ('Gaussian Process' , GaussianProcessClassifier(**rs) )
+, ('K-Nearest Neighbors' , KNeighborsClassifier() )
+, ('LDA' , LinearDiscriminantAnalysis() )
+, ('Logistic Regression' , LogisticRegression(**rs) )
+, ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
+, ('MLP' , MLPClassifier(max_iter = 500, **rs) )
+, ('Multinomial NB' , MultinomialNB() )
+, ('Passive Aggressive' , PassiveAggressiveClassifier(**rs, **njobs) )
+, ('QDA' , QuadraticDiscriminantAnalysis() )
+, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
+, ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
+, n_estimators = 1000
+, bootstrap = True
+, oob_score = True
+, **njobs
+, **rs
+, max_features = 'auto') )
+, ('Ridge Classifier' , RidgeClassifier(**rs) )
+, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
+, ('SVC' , SVC(**rs) )
+, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
+, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
+, ('Dummy Classifier' , DummyClassifier(strategy = 'most_frequent') )
]
mm_skf_scoresD = {}
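
The body of MultModelsCl that consumes this list is not part of the diff; the following is a hedged sketch of the usual pattern, reusing only the names visible above (models, input_df, target, sel_cv, scoring_fn, mm_skf_scoresD) and assuming a MinMaxScaler preprocessing step, which may differ from the real implementation.

# Hedged sketch of how a (name, estimator) list is typically consumed;
# the real MultModelsCl body is not shown in this commit.
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler  # assumed preprocessing step

for model_name, model_fn in models:
    pipe = Pipeline([('prep', MinMaxScaler())
                     , ('model', model_fn)])
    skf_scores = cross_validate(pipe, input_df, target
                                , cv = sel_cv, scoring = scoring_fn
                                , return_train_score = True)
    # keep the mean of each 'test_*' metric, e.g. the 'test_mcc' printed later
    mm_skf_scoresD[model_name] = {k: round(v.mean(), 2)
                                  for k, v in skf_scores.items()
                                  if k.startswith('test_')}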

View file

@@ -14,10 +14,11 @@ sys.path
# import
from GetMLData import *
from SplitTTS import *
-#from MultClfs import *
-from MultClfs_SIMPLE import *
+from MultClfs import *
+#from MultClfs_SIMPLE import *
#%%
rs = {'random_state': 42}
skf_cv = StratifiedKFold(n_splits = 10
, shuffle = True,**rs)
#sel_cv = logo
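
A quick illustration (toy data, not from the repo) of what the 10-fold stratified CV above guarantees: every fold preserves the class ratio, which matters for imbalanced resistance labels.

import numpy as np
from collections import Counter
from sklearn.model_selection import StratifiedKFold

y_toy = np.array([0] * 90 + [1] * 10)   # 9:1 imbalance
skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)
for _, test_idx in skf.split(np.zeros((100, 1)), y_toy):
    print(Counter(y_toy[test_idx]))     # every fold: Counter({0: 9, 1: 1})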
@@ -28,12 +29,12 @@ skf_cv = StratifiedKFold(n_splits = 10
gene_model_paramD = {'data_combined_model' : False
, 'use_or' : False
, 'omit_all_genomic_features': False
-, 'write_maskfile' : True
+, 'write_maskfile' : False
, 'write_outfile' : False }
#df = getmldata(gene, drug, **gene_model_paramD)
-df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
-#df = getmldata('embB', 'ethambutol' , **gene_model_paramD)
+#df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
+df = getmldata('embB', 'ethambutol' , **gene_model_paramD)
#df = getmldata('katG', 'isoniazid' , **gene_model_paramD)
#df = getmldata('rpoB', 'rifampicin' , **gene_model_paramD)
#df = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
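
The commented-out calls above suggest the script is re-run per gene/drug pair by toggling comments; a small loop over the same five pairs (getmldata and gene_model_paramD as defined above) does the same without edits:

gene_drugD = {'pncA': 'pyrazinamide'
              , 'embB': 'ethambutol'
              , 'katG': 'isoniazid'
              , 'rpoB': 'rifampicin'
              , 'gid' : 'streptomycin'}
for gene, drug in gene_drugD.items():
    df = getmldata(gene, drug, **gene_model_paramD)
    print(gene, drug, len(df))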
@@ -68,9 +69,8 @@ len(df)
Counter(df2['y'])
Counter(df2['y_bts'])
-fooD = MultModelsCl(input_df = df2['X_ros']
-, target = df2['y_ros']
+fooD = MultModelsCl(input_df = df2['X']
+, target = df2['y']
, sel_cv = skf_cv
, run_blind_test = True
, blind_test_df = df2['X_bts']
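
This hunk swaps the resampled inputs (X_ros/y_ros) for the raw split (X/y). The '_ros' suffix presumably denotes random-oversampling output; if so, a minimal imblearn sketch of producing such keys (an assumption, the resampling code is not in this diff):

# Assumption: '_ros' keys come from RandomOverSampler (imblearn).
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state = 42)
X_ros, y_ros = ros.fit_resample(df2['X'], df2['y'])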
@@ -87,7 +87,12 @@ for k, v in fooD.items():
, '\nTRAIN MCC:', fooD[k]['test_mcc']
, '\nBTS MCC:' , fooD[k]['bts_mcc']
, '\nDIFF:',fooD[k]['bts_mcc'] - fooD[k]['test_mcc'] )
+for k, v in fooD.items():
+print('\nModel:', k
+, '\nTRAIN ACCURACY:', fooD[k]['test_accuracy']
+, '\nBTS ACCURACY:' , fooD[k]['bts_accuracy']
+, '\nDIFF:',fooD[k]['bts_accuracy'] - fooD[k]['test_accuracy'] )
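
A possible follow-up (not part of the commit): collect the same numbers into a pandas DataFrame so the train/blind-test gaps can be sorted rather than eyeballed.

import pandas as pd

scores_df = pd.DataFrame(fooD).T   # one row per model
scores_df['mcc_diff'] = scores_df['bts_mcc'] - scores_df['test_mcc']
print(scores_df[['test_mcc', 'bts_mcc', 'mcc_diff']].sort_values('mcc_diff'))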
#%% CHECK SCALING
embb_df = getmldata('embB', 'ethambutol' , **combined_model_paramD)
all(embb_df.columns.isin(['gene_name'])) # should be False
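
Aside: all(embb_df.columns.isin(['gene_name'])) is False for any frame with more than one distinct column name, so the check above can never fail. If the intent is to confirm the identifier column was dropped before modelling, a direct membership test expresses that:

# Clearer equivalent of the apparent intent (assumption): the identifier
# column must not survive as a feature.
assert 'gene_name' not in embb_df.columns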