saving work
This commit is contained in:
parent
652cf4802e
commit
79cb89a019
2 changed files with 39 additions and 56 deletions
|
@ -227,7 +227,7 @@ def MultModelsCl(input_df, target, skf_cv
|
||||||
, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
|
, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
|
||||||
, ('SVC' , SVC(**rs) )
|
, ('SVC' , SVC(**rs) )
|
||||||
, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
|
, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
|
||||||
, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
|
, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
|
||||||
]
|
]
|
||||||
|
|
||||||
mm_skf_scoresD = {}
|
mm_skf_scoresD = {}
|
||||||
|
|
|
@ -74,6 +74,8 @@ from sklearn.impute import KNNImputer as KNN
|
||||||
import json
|
import json
|
||||||
import argparse
|
import argparse
|
||||||
import re
|
import re
|
||||||
|
from sklearn.model_selection import LeaveOneGroupOut
|
||||||
|
|
||||||
#%% GLOBALS
|
#%% GLOBALS
|
||||||
rs = {'random_state': 42}
|
rs = {'random_state': 42}
|
||||||
njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
|
njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
|
||||||
|
@ -95,6 +97,9 @@ rskf_cv = RepeatedStratifiedKFold(n_splits = 10
|
||||||
, n_repeats = 3
|
, n_repeats = 3
|
||||||
, **rs)
|
, **rs)
|
||||||
|
|
||||||
|
logo = LeaveOneGroupOut()
|
||||||
|
|
||||||
|
|
||||||
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
|
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
|
||||||
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
|
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
@ -116,7 +121,7 @@ def remove(string):
|
||||||
# Run Multiple Classifiers
|
# Run Multiple Classifiers
|
||||||
############################
|
############################
|
||||||
# Multiple Classification - Model Pipeline
|
# Multiple Classification - Model Pipeline
|
||||||
def XGBClf(input_df, target, sel_cv
|
def MultClfs_fi(input_df, target, sel_cv
|
||||||
, blind_test_df
|
, blind_test_df
|
||||||
, blind_test_target
|
, blind_test_target
|
||||||
, tts_split_type
|
, tts_split_type
|
||||||
|
@ -175,9 +180,37 @@ def XGBClf(input_df, target, sel_cv
|
||||||
#======================================================
|
#======================================================
|
||||||
# Specify multiple Classification Models
|
# Specify multiple Classification Models
|
||||||
#======================================================
|
#======================================================
|
||||||
models = [ ('XGBoost' , XGBClassifier(**rs, verbosity = 3, use_label_encoder = False, **njobs) )
|
models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
|
||||||
, ( 'Random Forest', RandomForestClassifier(**rs, **njobs, n_estimators = 1000))
|
# , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
|
||||||
, ('Logistic Regression', LogisticRegression(**rs))]
|
# , ('Decision Tree' , DecisionTreeClassifier(**rs) )
|
||||||
|
# , ('Extra Tree' , ExtraTreeClassifier(**rs) )
|
||||||
|
# , ('Extra Trees' , ExtraTreesClassifier(**rs) )
|
||||||
|
# , ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
|
||||||
|
# , ('Gaussian NB' , GaussianNB() )
|
||||||
|
# , ('Gaussian Process' , GaussianProcessClassifier(**rs) )
|
||||||
|
# , ('K-Nearest Neighbors' , KNeighborsClassifier() )
|
||||||
|
# , ('LDA' , LinearDiscriminantAnalysis() )
|
||||||
|
# , ('Logistic Regression' , LogisticRegression(**rs) )
|
||||||
|
# , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
|
||||||
|
# , ('MLP' , MLPClassifier(max_iter = 500, **rs) )
|
||||||
|
# , ('Multinomial' , MultinomialNB() )
|
||||||
|
# , ('Naive Bayes' , BernoulliNB() )
|
||||||
|
# , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
|
||||||
|
# , ('QDA' , QuadraticDiscriminantAnalysis() )
|
||||||
|
# , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
|
||||||
|
# # , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
|
||||||
|
# # , n_estimators = 1000
|
||||||
|
# # , bootstrap = True
|
||||||
|
# # , oob_score = True
|
||||||
|
# # , **njobs
|
||||||
|
# # , **rs
|
||||||
|
# # , max_features = 'auto') )
|
||||||
|
# , ('Ridge Classifier' , RidgeClassifier(**rs) )
|
||||||
|
# , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
|
||||||
|
# , ('SVC' , SVC(**rs) )
|
||||||
|
, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
|
||||||
|
, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
|
||||||
|
]
|
||||||
|
|
||||||
mm_skf_scoresD = {}
|
mm_skf_scoresD = {}
|
||||||
|
|
||||||
|
@ -271,53 +304,3 @@ def XGBClf(input_df, target, sel_cv
|
||||||
|
|
||||||
return(mm_skf_scoresD)
|
return(mm_skf_scoresD)
|
||||||
#%%
|
#%%
|
||||||
sel_cv = skf_cv
|
|
||||||
# param dict for getmldata()
|
|
||||||
combined_model_paramD = {'data_combined_model' : False
|
|
||||||
, 'use_or' : False
|
|
||||||
, 'omit_all_genomic_features': False
|
|
||||||
, 'write_maskfile' : False
|
|
||||||
, 'write_outfile' : False }
|
|
||||||
#df = getmldata(gene, drug, **combined_model_paramD)
|
|
||||||
df = getmldata('pncA', 'pyrazinamide', **combined_model_paramD)
|
|
||||||
|
|
||||||
df2 = split_tts(df
|
|
||||||
, data_type = 'actual'
|
|
||||||
, split_type = '80_20'
|
|
||||||
, oversampling = False
|
|
||||||
, dst_colname = 'dst'
|
|
||||||
, target_colname = 'dst_mode'
|
|
||||||
, include_gene_name = True
|
|
||||||
, random_state = 42 # default
|
|
||||||
)
|
|
||||||
|
|
||||||
all(df2['X'].columns.isin(['gene_name']))
|
|
||||||
|
|
||||||
|
|
||||||
fooD = XGBClf (input_df = df2['X']
|
|
||||||
, target = df2['y']
|
|
||||||
, sel_cv = skf_cv
|
|
||||||
, run_blind_test = True
|
|
||||||
, blind_test_df = df2['X_bts']
|
|
||||||
, blind_test_target = df2['y_bts']
|
|
||||||
, tts_split_type = '80_20'
|
|
||||||
, var_type = 'mixed'
|
|
||||||
, resampling_type = 'none' # default
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
for k, v in fooD.items():
|
|
||||||
print('\nK:', k
|
|
||||||
, '\nTRAIN MCC:', fooD[k]['test_mcc']
|
|
||||||
, '\nBTS MCC:' , fooD[k]['bts_mcc'] )
|
|
||||||
|
|
||||||
#%%
|
|
||||||
# # fit model on training data
|
|
||||||
# model = XGBClassifier()
|
|
||||||
# model.fit( df2['X'], df2['y'])
|
|
||||||
# # feature importance
|
|
||||||
# print(model.feature_importances_)
|
|
||||||
# # plot
|
|
||||||
# pyplot.bar(range(len(model.feature_importances_)), model.feature_importances_)
|
|
||||||
# pyplot.show()
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue