diff --git a/scripts/ml/dummy_classifier.py b/scripts/ml/dummy_classifier.py index bbc4015..9ec1e92 100644 --- a/scripts/ml/dummy_classifier.py +++ b/scripts/ml/dummy_classifier.py @@ -62,6 +62,7 @@ X.columns y = df_clean.iloc[:,171] # dst y.value_counts() +######################### y2 = df_clean.iloc[:,172] #dst_mode y2.value_counts() @@ -107,3 +108,34 @@ acccuracy: TP+TN/TP+TN+FP+FN 114/71 + +###################################### +# try with CV + +X_eg = np.array([-1, 1, 1, 1, -2, 9, 4, 4, 1, -1, 3, 0]) +y_eg = np.array([0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1]) +dummy_clf = DummyClassifier(strategy="most_frequent") +dummy_clf.fit(X_eg, y_eg) +#DummyClassifier(strategy='most_frequent') +dummy_clf.predict(X_eg) +#dummy_clf.predict(np.array([1,1,1,1,1,1,1,1,1,1])) +#dummy_clf.predict_proba(X_eg) + +dummy_clf.score(X_eg, y_eg) + + +cv_DummyD = cross_validate(dummy_clf + , X_eg + , y_eg + , cv = 5 + #, groups = group + , scoring = scoring_fn + , return_train_score = True) + +cv_dummyD_ALL= {} +cv_dummyD_ALL['DUMMY'] = {} + +for key, value in cv_DummyD.items(): + print('\nkey:', key, '\nvalue:', value) + print('\nmean value:', np.mean(value)) + cv_dummyD_ALL['DUMMY'][key] = round(np.mean(value),2) \ No newline at end of file diff --git a/scripts/ml/ml_functions/MultClfs.py b/scripts/ml/ml_functions/MultClfs.py index dafe756..c22e41b 100755 --- a/scripts/ml/ml_functions/MultClfs.py +++ b/scripts/ml/ml_functions/MultClfs.py @@ -78,9 +78,10 @@ import itertools from sklearn.model_selection import LeaveOneGroupOut from sklearn.decomposition import PCA from sklearn.naive_bayes import ComplementNB +from sklearn.dummy import DummyClassifier #%% GLOBALS -#rs = {'random_state': 42} +#rs = {'random_state': 42} # INSIDE FUNCTION CALL NOW #njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef) @@ -261,37 +262,36 @@ def MultModelsCl(input_df, target #====================================================== models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) ) - #, ('Bernoulli NB' , BernoulliNB() ) # pks Naive Bayes, CAUTION - , ('Complement NB' , ComplementNB() ) - , ('Decision Tree' , DecisionTreeClassifier(**rs) ) - , ('Extra Tree' , ExtraTreeClassifier(**rs) ) - , ('Extra Trees' , ExtraTreesClassifier(**rs) ) - , ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) - , ('Gaussian NB' , GaussianNB() ) - , ('Gaussian Process' , GaussianProcessClassifier(**rs) ) - , ('K-Nearest Neighbors' , KNeighborsClassifier() ) - , ('LDA' , LinearDiscriminantAnalysis() ) - , ('Logistic Regression' , LogisticRegression(**rs) ) - , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) - , ('MLP' , MLPClassifier(max_iter = 500, **rs) ) - , ('Multinomial NB' , MultinomialNB() ) - - , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) - , ('QDA' , QuadraticDiscriminantAnalysis() ) - , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) - , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 - , n_estimators = 1000 - , bootstrap = True - , oob_score = True - , **njobs - , **rs - , max_features = 'auto') ) - , ('Ridge Classifier' , RidgeClassifier(**rs) ) - , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) - , ('SVC' , SVC(**rs) ) - , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) - , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) ) - + #, ('Bernoulli NB' , BernoulliNB() ) # pks Naive Bayes, CAUTION + , ('Complement NB' , ComplementNB() ) + , ('Decision Tree' , DecisionTreeClassifier(**rs) ) + , ('Extra Tree' , ExtraTreeClassifier(**rs) ) + , ('Extra Trees' , ExtraTreesClassifier(**rs) ) + , ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) + , ('Gaussian NB' , GaussianNB() ) + , ('Gaussian Process' , GaussianProcessClassifier(**rs) ) + , ('K-Nearest Neighbors' , KNeighborsClassifier() ) + , ('LDA' , LinearDiscriminantAnalysis() ) + , ('Logistic Regression' , LogisticRegression(**rs) ) + , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) + , ('MLP' , MLPClassifier(max_iter = 500, **rs) ) + , ('Multinomial NB' , MultinomialNB() ) + , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) + , ('QDA' , QuadraticDiscriminantAnalysis() ) + , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) + , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 + , n_estimators = 1000 + , bootstrap = True + , oob_score = True + , **njobs + , **rs + , max_features = 'auto') ) + , ('Ridge Classifier' , RidgeClassifier(**rs) ) + , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) + , ('SVC' , SVC(**rs) ) + , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) + , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) ) + , ('Dummy Classifier' , DummyClassifier(strategy = 'most_frequent') ) ] mm_skf_scoresD = {} diff --git a/scripts/ml/ml_functions/test_func_singlegene.py b/scripts/ml/ml_functions/test_func_singlegene.py index 707b188..18267dd 100644 --- a/scripts/ml/ml_functions/test_func_singlegene.py +++ b/scripts/ml/ml_functions/test_func_singlegene.py @@ -14,10 +14,11 @@ sys.path # import from GetMLData import * from SplitTTS import * -#from MultClfs import * -from MultClfs_SIMPLE import * +from MultClfs import * +#from MultClfs_SIMPLE import * #%% +rs = {'random_state': 42} skf_cv = StratifiedKFold(n_splits = 10 , shuffle = True,**rs) #sel_cv = logo @@ -28,12 +29,12 @@ skf_cv = StratifiedKFold(n_splits = 10 gene_model_paramD = {'data_combined_model' : False , 'use_or' : False , 'omit_all_genomic_features': False - , 'write_maskfile' : True + , 'write_maskfile' : False , 'write_outfile' : False } #df = getmldata(gene, drug, **gene_model_paramD) -df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD) -#df = getmldata('embB', 'ethambutol' , **gene_model_paramD) +#df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD) +df = getmldata('embB', 'ethambutol' , **gene_model_paramD) #df = getmldata('katG', 'isoniazid' , **gene_model_paramD) #df = getmldata('rpoB', 'rifampicin' , **gene_model_paramD) #df = getmldata('gid' , 'streptomycin' , **gene_model_paramD) @@ -68,9 +69,8 @@ len(df) Counter(df2['y']) Counter(df2['y_bts']) - -fooD = MultModelsCl(input_df = df2['X_ros'] - , target = df2['y_ros'] +fooD = MultModelsCl(input_df = df2['X'] + , target = df2['y'] , sel_cv = skf_cv , run_blind_test = True , blind_test_df = df2['X_bts'] @@ -87,7 +87,12 @@ for k, v in fooD.items(): , '\nTRAIN MCC:', fooD[k]['test_mcc'] , '\nBTS MCC:' , fooD[k]['bts_mcc'] , '\nDIFF:',fooD[k]['bts_mcc'] - fooD[k]['test_mcc'] ) - + +for k, v in fooD.items(): + print('\nModel:', k + , '\nTRAIN ACCURACY:', fooD[k]['test_accuracy'] + , '\nBTS ACCURACY:' , fooD[k]['bts_accuracy'] + , '\nDIFF:',fooD[k]['bts_accuracy'] - fooD[k]['test_accuracy'] ) #%% CHECK SCALING embb_df = getmldata('embB', 'ethambutol' , **combined_model_paramD) all(embb_df.columns.isin(['gene_name'])) # should be False diff --git a/scripts/ml/ml_iterator.py b/scripts/ml/ml_iterator.py index ea9002c..35b3c9d 100755 --- a/scripts/ml/ml_iterator.py +++ b/scripts/ml/ml_iterator.py @@ -82,22 +82,26 @@ for gene, drug in ml_gene_drugD.items(): , 'target' : tempD['y'] , 'var_type' : 'mixed' , 'resampling_type': 'none'} - , 'smnc_paramD': { 'input_df' : tempD['X_smnc'] - , 'target' : tempD['y_smnc'] + + , 'smnc_paramD' : { 'input_df' : tempD['X_smnc'] + , 'target' : tempD['y_smnc'] + , 'var_type' : 'mixed' + , 'resampling_type' : 'smnc'} + + , 'ros_paramD' : { 'input_df' : tempD['X_ros'] + , 'target' : tempD['y_ros'] , 'var_type' : 'mixed' - , 'resampling_type' : 'smnc'} - , 'ros_paramD': { 'input_df' : tempD['X_ros'] - , 'target' : tempD['y_ros'] - , 'var_type' : 'mixed' - , 'resampling_type' : 'ros'} - , 'rus_paramD' : { 'input_df' : tempD['X_rus'] + , 'resampling_type' : 'ros'} + + , 'rus_paramD' : { 'input_df' : tempD['X_rus'] , 'target' : tempD['y_rus'] , 'var_type' : 'mixed' , 'resampling_type' : 'rus'} - , 'rouC_paramD' : { 'input_df' : tempD['X_rouC'] - , 'target' : tempD['y_rouC'] - , 'var_type' : 'mixed' - , 'resampling_type': 'rouC'} + + , 'rouC_paramD' : { 'input_df' : tempD['X_rouC'] + , 'target' : tempD['y_rouC'] + , 'var_type' : 'mixed' + , 'resampling_type' : 'rouC'} } mmDD = {}