From 79cb89a0197d1d73e733f754eb4a627d0ccf7e2a Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 5 Jul 2022 16:06:03 +0100 Subject: [PATCH] saving work --- scripts/ml/ml_functions/MultClfs.py | 2 +- scripts/ml/ml_functions/MultClfs_fi.py | 93 +++++++++++--------------- 2 files changed, 39 insertions(+), 56 deletions(-) diff --git a/scripts/ml/ml_functions/MultClfs.py b/scripts/ml/ml_functions/MultClfs.py index d3b684a..688caf3 100755 --- a/scripts/ml/ml_functions/MultClfs.py +++ b/scripts/ml/ml_functions/MultClfs.py @@ -227,7 +227,7 @@ def MultModelsCl(input_df, target, skf_cv , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) , ('SVC' , SVC(**rs) ) , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) - , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) ) + , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) ) ] mm_skf_scoresD = {} diff --git a/scripts/ml/ml_functions/MultClfs_fi.py b/scripts/ml/ml_functions/MultClfs_fi.py index 3803bd7..89562e2 100644 --- a/scripts/ml/ml_functions/MultClfs_fi.py +++ b/scripts/ml/ml_functions/MultClfs_fi.py @@ -74,6 +74,8 @@ from sklearn.impute import KNNImputer as KNN import json import argparse import re +from sklearn.model_selection import LeaveOneGroupOut + #%% GLOBALS rs = {'random_state': 42} njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores @@ -95,6 +97,9 @@ rskf_cv = RepeatedStratifiedKFold(n_splits = 10 , n_repeats = 3 , **rs) +logo = LeaveOneGroupOut() + + mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} jacc_score_fn = {'jcc': make_scorer(jaccard_score)} ############################################################################### @@ -116,7 +121,7 @@ def remove(string): # Run Multiple Classifiers ############################ # Multiple Classification - Model Pipeline -def XGBClf(input_df, target, sel_cv +def MultClfs_fi(input_df, target, sel_cv , blind_test_df , blind_test_target , tts_split_type @@ -175,9 +180,37 @@ def XGBClf(input_df, target, sel_cv #====================================================== # Specify multiple Classification Models #====================================================== - models = [ ('XGBoost' , XGBClassifier(**rs, verbosity = 3, use_label_encoder = False, **njobs) ) - , ( 'Random Forest', RandomForestClassifier(**rs, **njobs, n_estimators = 1000)) - , ('Logistic Regression', LogisticRegression(**rs))] + models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) + # , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) ) + # , ('Decision Tree' , DecisionTreeClassifier(**rs) ) + # , ('Extra Tree' , ExtraTreeClassifier(**rs) ) + # , ('Extra Trees' , ExtraTreesClassifier(**rs) ) + # , ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) + # , ('Gaussian NB' , GaussianNB() ) + # , ('Gaussian Process' , GaussianProcessClassifier(**rs) ) + # , ('K-Nearest Neighbors' , KNeighborsClassifier() ) + # , ('LDA' , LinearDiscriminantAnalysis() ) + # , ('Logistic Regression' , LogisticRegression(**rs) ) + # , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) + # , ('MLP' , MLPClassifier(max_iter = 500, **rs) ) + # , ('Multinomial' , MultinomialNB() ) + # , ('Naive Bayes' , BernoulliNB() ) + # , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) + # , ('QDA' , QuadraticDiscriminantAnalysis() ) + # , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) + # # , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 + # # , n_estimators = 1000 + # # , bootstrap = True + # # , oob_score = True + # # , **njobs + # # , **rs + # # , max_features = 'auto') ) + # , ('Ridge Classifier' , RidgeClassifier(**rs) ) + # , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) + # , ('SVC' , SVC(**rs) ) + , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) + , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) ) + ] mm_skf_scoresD = {} @@ -270,54 +303,4 @@ def XGBClf(input_df, target, sel_cv mm_skf_scoresD[model_name]['bts_jcc'] = round(jaccard_score(blind_test_target, bts_predict),2) return(mm_skf_scoresD) -#%% -sel_cv = skf_cv -# param dict for getmldata() -combined_model_paramD = {'data_combined_model' : False - , 'use_or' : False - , 'omit_all_genomic_features': False - , 'write_maskfile' : False - , 'write_outfile' : False } -#df = getmldata(gene, drug, **combined_model_paramD) -df = getmldata('pncA', 'pyrazinamide', **combined_model_paramD) - -df2 = split_tts(df - , data_type = 'actual' - , split_type = '80_20' - , oversampling = False - , dst_colname = 'dst' - , target_colname = 'dst_mode' - , include_gene_name = True - , random_state = 42 # default - ) - -all(df2['X'].columns.isin(['gene_name'])) - - -fooD = XGBClf (input_df = df2['X'] - , target = df2['y'] - , sel_cv = skf_cv - , run_blind_test = True - , blind_test_df = df2['X_bts'] - , blind_test_target = df2['y_bts'] - , tts_split_type = '80_20' - , var_type = 'mixed' - , resampling_type = 'none' # default -) - - -for k, v in fooD.items(): - print('\nK:', k - , '\nTRAIN MCC:', fooD[k]['test_mcc'] - , '\nBTS MCC:' , fooD[k]['bts_mcc'] ) - -#%% -# # fit model no training data -# model = XGBClassifier() -# model.fit( df2['X'], df2['y']) -# # feature importance -# print(model.feature_importances_) -# # plot -# pyplot.bar(range(len(model.feature_importances_)), model.feature_importances_) -# pyplot.show() - +#%% \ No newline at end of file