added dummy classifier to models
This commit is contained in:
parent
c32005c99c
commit
744bc8f4a1
4 changed files with 94 additions and 53 deletions
|
@ -62,6 +62,7 @@ X.columns
|
|||
|
||||
y = df_clean.iloc[:,171] # dst
|
||||
y.value_counts()
|
||||
#########################
|
||||
|
||||
y2 = df_clean.iloc[:,172] #dst_mode
|
||||
y2.value_counts()
|
||||
|
@ -107,3 +108,34 @@ acccuracy:
|
|||
(TP+TN)/(TP+TN+FP+FN)
|
||||
|
||||
114/71
|
||||
|
||||
######################################
|
||||
# try with CV
|
||||
|
||||
X_eg = np.array([-1, 1, 1, 1, -2, 9, 4, 4, 1, -1, 3, 0])
|
||||
y_eg = np.array([0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1])
|
||||
dummy_clf = DummyClassifier(strategy="most_frequent")
|
||||
dummy_clf.fit(X_eg, y_eg)
|
||||
#DummyClassifier(strategy='most_frequent')
|
||||
dummy_clf.predict(X_eg)
|
||||
#dummy_clf.predict(np.array([1,1,1,1,1,1,1,1,1,1]))
|
||||
#dummy_clf.predict_proba(X_eg)
|
||||
|
||||
dummy_clf.score(X_eg, y_eg)
|
||||
|
||||
|
||||
cv_DummyD = cross_validate(dummy_clf
|
||||
, X_eg
|
||||
, y_eg
|
||||
, cv = 5
|
||||
#, groups = group
|
||||
, scoring = scoring_fn
|
||||
, return_train_score = True)
|
||||
|
||||
cv_dummyD_ALL= {}
|
||||
cv_dummyD_ALL['DUMMY'] = {}
|
||||
|
||||
for key, value in cv_DummyD.items():
|
||||
print('\nkey:', key, '\nvalue:', value)
|
||||
print('\nmean value:', np.mean(value))
|
||||
cv_dummyD_ALL['DUMMY'][key] = round(np.mean(value),2)
|
|
@ -78,9 +78,10 @@ import itertools
|
|||
from sklearn.model_selection import LeaveOneGroupOut
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.naive_bayes import ComplementNB
|
||||
from sklearn.dummy import DummyClassifier
|
||||
|
||||
#%% GLOBALS
|
||||
#rs = {'random_state': 42}
|
||||
#rs = {'random_state': 42} # INSIDE FUNCTION CALL NOW
|
||||
#njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
|
||||
|
||||
scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef)
|
||||
|
@ -261,37 +262,36 @@ def MultModelsCl(input_df, target
|
|||
#======================================================
|
||||
models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
|
||||
, ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
|
||||
#, ('Bernoulli NB' , BernoulliNB() ) # pks Naive Bayes, CAUTION
|
||||
, ('Complement NB' , ComplementNB() )
|
||||
, ('Decision Tree' , DecisionTreeClassifier(**rs) )
|
||||
, ('Extra Tree' , ExtraTreeClassifier(**rs) )
|
||||
, ('Extra Trees' , ExtraTreesClassifier(**rs) )
|
||||
, ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
|
||||
, ('Gaussian NB' , GaussianNB() )
|
||||
, ('Gaussian Process' , GaussianProcessClassifier(**rs) )
|
||||
, ('K-Nearest Neighbors' , KNeighborsClassifier() )
|
||||
, ('LDA' , LinearDiscriminantAnalysis() )
|
||||
, ('Logistic Regression' , LogisticRegression(**rs) )
|
||||
, ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
|
||||
, ('MLP' , MLPClassifier(max_iter = 500, **rs) )
|
||||
, ('Multinomial NB' , MultinomialNB() )
|
||||
|
||||
, ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
|
||||
, ('QDA' , QuadraticDiscriminantAnalysis() )
|
||||
, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
|
||||
, ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
|
||||
, n_estimators = 1000
|
||||
, bootstrap = True
|
||||
, oob_score = True
|
||||
, **njobs
|
||||
, **rs
|
||||
, max_features = 'auto') )
|
||||
, ('Ridge Classifier' , RidgeClassifier(**rs) )
|
||||
, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
|
||||
, ('SVC' , SVC(**rs) )
|
||||
, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
|
||||
, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
|
||||
|
||||
#, ('Bernoulli NB' , BernoulliNB() ) # pks Naive Bayes, CAUTION
|
||||
, ('Complement NB' , ComplementNB() )
|
||||
, ('Decision Tree' , DecisionTreeClassifier(**rs) )
|
||||
, ('Extra Tree' , ExtraTreeClassifier(**rs) )
|
||||
, ('Extra Trees' , ExtraTreesClassifier(**rs) )
|
||||
, ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
|
||||
, ('Gaussian NB' , GaussianNB() )
|
||||
, ('Gaussian Process' , GaussianProcessClassifier(**rs) )
|
||||
, ('K-Nearest Neighbors' , KNeighborsClassifier() )
|
||||
, ('LDA' , LinearDiscriminantAnalysis() )
|
||||
, ('Logistic Regression' , LogisticRegression(**rs) )
|
||||
, ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
|
||||
, ('MLP' , MLPClassifier(max_iter = 500, **rs) )
|
||||
, ('Multinomial NB' , MultinomialNB() )
|
||||
, ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
|
||||
, ('QDA' , QuadraticDiscriminantAnalysis() )
|
||||
, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
|
||||
, ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
|
||||
, n_estimators = 1000
|
||||
, bootstrap = True
|
||||
, oob_score = True
|
||||
, **njobs
|
||||
, **rs
|
||||
, max_features = 'auto') )
|
||||
, ('Ridge Classifier' , RidgeClassifier(**rs) )
|
||||
, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
|
||||
, ('SVC' , SVC(**rs) )
|
||||
, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
|
||||
, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
|
||||
, ('Dummy Classifier' , DummyClassifier(strategy = 'most_frequent') )
|
||||
]
|
||||
|
||||
mm_skf_scoresD = {}
|
||||
|
|
|
@ -14,10 +14,11 @@ sys.path
|
|||
# import
|
||||
from GetMLData import *
|
||||
from SplitTTS import *
|
||||
#from MultClfs import *
|
||||
from MultClfs_SIMPLE import *
|
||||
from MultClfs import *
|
||||
#from MultClfs_SIMPLE import *
|
||||
|
||||
#%%
|
||||
rs = {'random_state': 42}
|
||||
skf_cv = StratifiedKFold(n_splits = 10
|
||||
, shuffle = True,**rs)
|
||||
#sel_cv = logo
|
||||
|
@ -28,12 +29,12 @@ skf_cv = StratifiedKFold(n_splits = 10
|
|||
gene_model_paramD = {'data_combined_model' : False
|
||||
, 'use_or' : False
|
||||
, 'omit_all_genomic_features': False
|
||||
, 'write_maskfile' : True
|
||||
, 'write_maskfile' : False
|
||||
, 'write_outfile' : False }
|
||||
|
||||
#df = getmldata(gene, drug, **gene_model_paramD)
|
||||
df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
|
||||
#df = getmldata('embB', 'ethambutol' , **gene_model_paramD)
|
||||
#df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
|
||||
df = getmldata('embB', 'ethambutol' , **gene_model_paramD)
|
||||
#df = getmldata('katG', 'isoniazid' , **gene_model_paramD)
|
||||
#df = getmldata('rpoB', 'rifampicin' , **gene_model_paramD)
|
||||
#df = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
|
||||
|
@ -68,9 +69,8 @@ len(df)
|
|||
Counter(df2['y'])
|
||||
Counter(df2['y_bts'])
|
||||
|
||||
|
||||
fooD = MultModelsCl(input_df = df2['X_ros']
|
||||
, target = df2['y_ros']
|
||||
fooD = MultModelsCl(input_df = df2['X']
|
||||
, target = df2['y']
|
||||
, sel_cv = skf_cv
|
||||
, run_blind_test = True
|
||||
, blind_test_df = df2['X_bts']
|
||||
|
@ -88,6 +88,11 @@ for k, v in fooD.items():
|
|||
, '\nBTS MCC:' , fooD[k]['bts_mcc']
|
||||
, '\nDIFF:',fooD[k]['bts_mcc'] - fooD[k]['test_mcc'] )
|
||||
|
||||
for k, v in fooD.items():
|
||||
print('\nModel:', k
|
||||
, '\nTRAIN ACCURACY:', fooD[k]['test_accuracy']
|
||||
, '\nBTS ACCURACY:' , fooD[k]['bts_accuracy']
|
||||
, '\nDIFF:',fooD[k]['bts_accuracy'] - fooD[k]['test_accuracy'] )
|
||||
#%% CHECK SCALING
|
||||
embb_df = getmldata('embB', 'ethambutol' , **combined_model_paramD)
|
||||
all(embb_df.columns.isin(['gene_name'])) # should be False
|
||||
|
|
|
@ -82,22 +82,26 @@ for gene, drug in ml_gene_drugD.items():
|
|||
, 'target' : tempD['y']
|
||||
, 'var_type' : 'mixed'
|
||||
, 'resampling_type': 'none'}
|
||||
, 'smnc_paramD': { 'input_df' : tempD['X_smnc']
|
||||
, 'target' : tempD['y_smnc']
|
||||
|
||||
, 'smnc_paramD' : { 'input_df' : tempD['X_smnc']
|
||||
, 'target' : tempD['y_smnc']
|
||||
, 'var_type' : 'mixed'
|
||||
, 'resampling_type' : 'smnc'}
|
||||
|
||||
, 'ros_paramD' : { 'input_df' : tempD['X_ros']
|
||||
, 'target' : tempD['y_ros']
|
||||
, 'var_type' : 'mixed'
|
||||
, 'resampling_type' : 'smnc'}
|
||||
, 'ros_paramD': { 'input_df' : tempD['X_ros']
|
||||
, 'target' : tempD['y_ros']
|
||||
, 'var_type' : 'mixed'
|
||||
, 'resampling_type' : 'ros'}
|
||||
, 'rus_paramD' : { 'input_df' : tempD['X_rus']
|
||||
, 'resampling_type' : 'ros'}
|
||||
|
||||
, 'rus_paramD' : { 'input_df' : tempD['X_rus']
|
||||
, 'target' : tempD['y_rus']
|
||||
, 'var_type' : 'mixed'
|
||||
, 'resampling_type' : 'rus'}
|
||||
, 'rouC_paramD' : { 'input_df' : tempD['X_rouC']
|
||||
, 'target' : tempD['y_rouC']
|
||||
, 'var_type' : 'mixed'
|
||||
, 'resampling_type': 'rouC'}
|
||||
|
||||
, 'rouC_paramD' : { 'input_df' : tempD['X_rouC']
|
||||
, 'target' : tempD['y_rouC']
|
||||
, 'var_type' : 'mixed'
|
||||
, 'resampling_type' : 'rouC'}
|
||||
}
|
||||
|
||||
mmDD = {}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue