saving work

Tanushree Tunstall 2022-07-05 16:06:03 +01:00
parent 652cf4802e
commit 79cb89a019
2 changed files with 39 additions and 56 deletions

@@ -74,6 +74,8 @@ from sklearn.impute import KNNImputer as KNN
import json
import argparse
import re
+from sklearn.model_selection import LeaveOneGroupOut
#%% GLOBALS
rs = {'random_state': 42}
njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
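
The rs and njobs globals are unpacked into every estimator constructed further down; a one-line illustration of the pattern (the estimator chosen here is just an example, not part of this commit):

from sklearn.ensemble import RandomForestClassifier
# ** unpacking expands the shared globals into keyword arguments, i.e. this is
# equivalent to RandomForestClassifier(random_state = 42, n_jobs = os.cpu_count())
clf = RandomForestClassifier(**rs, **njobs)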
@@ -95,6 +97,9 @@ rskf_cv = RepeatedStratifiedKFold(n_splits = 10
, n_repeats = 3
, **rs)
+logo = LeaveOneGroupOut()
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
###############################################################################
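
For context, a minimal sketch (not part of this commit; the toy data and the groups array are invented) of how these scorer dicts and the newly added LeaveOneGroupOut splitter plug into scikit-learn's cross_validate:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import LeaveOneGroupOut, cross_validate

X = np.random.rand(60, 4)
y = np.random.randint(0, 2, 60)
groups = np.repeat([0, 1, 2], 20)  # e.g. one group label per data source

logo = LeaveOneGroupOut()
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}

# each LOGO split holds out one whole group; the scoring key 'mcc'
# surfaces in the results dict as 'test_mcc'
scores = cross_validate(LogisticRegression(random_state = 42), X, y
                        , cv = logo, groups = groups, scoring = mcc_score_fn)
print(scores['test_mcc'])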
@@ -116,7 +121,7 @@ def remove(string):
# Run Multiple Classifiers
############################
# Multiple Classification - Model Pipeline
-def XGBClf(input_df, target, sel_cv
+def MultClfs_fi(input_df, target, sel_cv
, blind_test_df
, blind_test_target
, tts_split_type
@@ -175,9 +180,37 @@ def XGBClf(input_df, target, sel_cv
#======================================================
# Specify multiple Classification Models
#======================================================
-models = [ ('XGBoost' , XGBClassifier(**rs, verbosity = 3, use_label_encoder = False, **njobs) )
-, ( 'Random Forest', RandomForestClassifier(**rs, **njobs, n_estimators = 1000))
-, ('Logistic Regression', LogisticRegression(**rs))]
+models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
+# , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
+# , ('Decision Tree' , DecisionTreeClassifier(**rs) )
+# , ('Extra Tree' , ExtraTreeClassifier(**rs) )
+# , ('Extra Trees' , ExtraTreesClassifier(**rs) )
+# , ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
+# , ('Gaussian NB' , GaussianNB() )
+# , ('Gaussian Process' , GaussianProcessClassifier(**rs) )
+# , ('K-Nearest Neighbors' , KNeighborsClassifier() )
+# , ('LDA' , LinearDiscriminantAnalysis() )
+# , ('Logistic Regression' , LogisticRegression(**rs) )
+# , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
+# , ('MLP' , MLPClassifier(max_iter = 500, **rs) )
+# , ('Multinomial' , MultinomialNB() )
+# , ('Naive Bayes' , BernoulliNB() )
+# , ('Passive Aggressive' , PassiveAggressiveClassifier(**rs, **njobs) )
+# , ('QDA' , QuadraticDiscriminantAnalysis() )
+# , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
+# # , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
+# #                                             , n_estimators = 1000
+# #                                             , bootstrap = True
+# #                                             , oob_score = True
+# #                                             , **njobs
+# #                                             , **rs
+# #                                             , max_features = 'auto') )
+# , ('Ridge Classifier' , RidgeClassifier(**rs) )
+# , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
+# , ('SVC' , SVC(**rs) )
+, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
+, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
+]
mm_skf_scoresD = {}
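
The loop body that fills mm_skf_scoresD is elided from this hunk; below is a hedged sketch of the pattern implied by the surrounding code. The key names ('test_mcc', 'bts_mcc') mirror those used further down, but the exact aggregation, and the assumption that cross_validate is imported from sklearn.model_selection at the top of the file, are guesses:

for model_name, model in models:
    # cross-validate each candidate model on the training data with the
    # chosen splitter and the MCC scorer dict defined in GLOBALS
    skf_scores = cross_validate(model, input_df, target
                                , cv = sel_cv, scoring = mcc_score_fn)
    mm_skf_scoresD[model_name] = {}
    mm_skf_scoresD[model_name]['test_mcc'] = round(skf_scores['test_mcc'].mean(), 2)
    # refit on the full training data, then score once on the blind test set
    model.fit(input_df, target)
    bts_predict = model.predict(blind_test_df)
    mm_skf_scoresD[model_name]['bts_mcc'] = round(matthews_corrcoef(blind_test_target, bts_predict), 2)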
@@ -270,54 +303,4 @@ def XGBClf(input_df, target, sel_cv
mm_skf_scoresD[model_name]['bts_jcc'] = round(jaccard_score(blind_test_target, bts_predict),2)
return(mm_skf_scoresD)
#%%
-sel_cv = skf_cv
-# param dict for getmldata()
-combined_model_paramD = {'data_combined_model' : False
-, 'use_or' : False
-, 'omit_all_genomic_features': False
-, 'write_maskfile' : False
-, 'write_outfile' : False }
-#df = getmldata(gene, drug, **combined_model_paramD)
-df = getmldata('pncA', 'pyrazinamide', **combined_model_paramD)
-df2 = split_tts(df
-, data_type = 'actual'
-, split_type = '80_20'
-, oversampling = False
-, dst_colname = 'dst'
-, target_colname = 'dst_mode'
-, include_gene_name = True
-, random_state = 42 # default
-)
-all(df2['X'].columns.isin(['gene_name']))
-fooD = XGBClf (input_df = df2['X']
-, target = df2['y']
-, sel_cv = skf_cv
-, run_blind_test = True
-, blind_test_df = df2['X_bts']
-, blind_test_target = df2['y_bts']
-, tts_split_type = '80_20'
-, var_type = 'mixed'
-, resampling_type = 'none' # default
-)
-for k, v in fooD.items():
-    print('\nK:', k
-    , '\nTRAIN MCC:', fooD[k]['test_mcc']
-    , '\nBTS MCC:' , fooD[k]['bts_mcc'] )
-#%%
-# # fit model no training data
-# model = XGBClassifier()
-# model.fit( df2['X'], df2['y'])
-# # feature importance
-# print(model.feature_importances_)
-# # plot
-# pyplot.bar(range(len(model.feature_importances_)), model.feature_importances_)
-# pyplot.show()
#%%
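
The driver block deleted above called the function under its old name, XGBClf. A hypothetical sketch of the equivalent call after this commit's rename, assuming the keyword interface carries over unchanged and that df2 comes from split_tts() as in the removed block (note that passing logo as sel_cv would additionally require group labels, which this hunk does not show):

scoresD = MultClfs_fi(input_df = df2['X']
                      , target = df2['y']
                      , sel_cv = skf_cv  # logo would also need group labels
                      , blind_test_df = df2['X_bts']
                      , blind_test_target = df2['y_bts']
                      , tts_split_type = '80_20'
                      , var_type = 'mixed'
                      , resampling_type = 'none')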