diff --git a/scripts/ml/ml_functions/MultClfs_SIMPLE.py b/scripts/ml/ml_functions/MultClfs_SIMPLE.py index b646da2..7e0fdba 100644 --- a/scripts/ml/ml_functions/MultClfs_SIMPLE.py +++ b/scripts/ml/ml_functions/MultClfs_SIMPLE.py @@ -144,10 +144,9 @@ scoreBT_mapD = {'bts_mcc' : 'MCC' ############################ # Multiple Classification - Model Pipeline def MultModelsCl(input_df, target - #, skf_cv , sel_cv - #, blind_test_df - #, blind_test_target + , blind_test_df + , blind_test_target , tts_split_type , resampling_type = 'none' # default @@ -230,37 +229,37 @@ def MultModelsCl(input_df, target #====================================================== # Specify multiple Classification Models #====================================================== - models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) - # , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) ) - # , ('Decision Tree' , DecisionTreeClassifier(**rs) ) - # , ('Extra Tree' , ExtraTreeClassifier(**rs) ) - # , ('Extra Trees' , ExtraTreesClassifier(**rs) ) - # , ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) - # , ('Gaussian NB' , GaussianNB() ) - # , ('Gaussian Process' , GaussianProcessClassifier(**rs) ) - # , ('K-Nearest Neighbors' , KNeighborsClassifier() ) + models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) + , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) ) + , ('Decision Tree' , DecisionTreeClassifier(**rs) ) + , ('Extra Tree' , ExtraTreeClassifier(**rs) ) + , ('Extra Trees' , ExtraTreesClassifier(**rs) ) + , ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) + , ('Gaussian NB' , GaussianNB() ) + , ('Gaussian Process' , GaussianProcessClassifier(**rs) ) + , ('K-Nearest Neighbors' , KNeighborsClassifier() ) , ('LDA' , LinearDiscriminantAnalysis() ) , ('Logistic Regression' , LogisticRegression(**rs) ) - # , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) - # , ('MLP' , MLPClassifier(max_iter = 500, **rs) ) - #, ('Multinomial' , MultinomialNB() ) - # , ('Naive Bayes' , BernoulliNB() ) - # , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) - # , ('QDA' , QuadraticDiscriminantAnalysis() ) + , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) + , ('MLP' , MLPClassifier(max_iter = 500, **rs) ) + , ('Multinomial' , MultinomialNB() ) + , ('Naive Bayes' , BernoulliNB() ) + , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) + , ('QDA' , QuadraticDiscriminantAnalysis() ) # , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) - # # , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 - # , n_estimators = 1000 - # , bootstrap = True - # , oob_score = True - # , **njobs - # , **rs - # , max_features = 'auto') ) - # , ('Ridge Classifier' , RidgeClassifier(**rs) ) - # , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) - # , ('SVC' , SVC(**rs) ) - # , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) + , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 + , n_estimators = 1000 + , bootstrap = True + , oob_score = True + , **njobs + , **rs + , max_features = 'auto') ) + , ('Ridge Classifier' , RidgeClassifier(**rs) ) + , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) + , ('SVC' , SVC(**rs) ) + , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) ) - # + ] mm_skf_scoresD = {} diff --git a/scripts/ml/ml_functions/test_func_singlegene.py b/scripts/ml/ml_functions/test_func_singlegene.py index 5d06db8..6abccb4 100644 --- a/scripts/ml/ml_functions/test_func_singlegene.py +++ b/scripts/ml/ml_functions/test_func_singlegene.py @@ -45,10 +45,13 @@ spl_type = '70_30' #spl_type = '80_20' #spl_type = 'sl' +#data_type = "actual" +data_type = "complete" + df2 = split_tts(df - , data_type = 'actual' + , data_type = data_type , split_type = spl_type - , oversampling = False + , oversampling = True , dst_colname = 'dst' , target_colname = 'dst_mode' , include_gene_name = True @@ -67,8 +70,8 @@ Counter(df2['y']) Counter(df2['y_bts']) -fooD = MultModelsCl(input_df = df2['X'] - , target = df2['y'] +fooD = MultModelsCl(input_df = df2['X_ros'] + , target = df2['y_ros'] , sel_cv = skf_cv , run_blind_test = True , blind_test_df = df2['X_bts'] @@ -83,7 +86,7 @@ fooD = MultModelsCl(input_df = df2['X'] for k, v in fooD.items(): print('\nModel:', k , '\nTRAIN MCC:', fooD[k]['test_mcc'] - , '\nBTS MCC:' , fooD[k]['bts_mcc'] + , '\nBTS MCC:' , fooD[k]['bts_mcc'] , '\nDIFF:',fooD[k]['bts_mcc'] - fooD[k]['test_mcc'] ) #%% CHECK SCALING diff --git a/scripts/ml/ml_iterator.py b/scripts/ml/ml_iterator.py index de8e62a..a633537 100755 --- a/scripts/ml/ml_iterator.py +++ b/scripts/ml/ml_iterator.py @@ -25,7 +25,7 @@ from GetMLData import * from SplitTTS import * # param dict for getmldata() -combined_model_paramD = {'data_combined_model' : False +gene_model_paramD = {'data_combined_model' : False , 'use_or' : False , 'omit_all_genomic_features': False , 'write_maskfile' : False @@ -48,7 +48,7 @@ for gene, drug in ml_gene_drugD.items(): , '\nDrug:', drug) gene_low = gene.lower() gene_dataD[gene_low] = getmldata(gene, drug - , **combined_model_paramD) + , **gene_model_paramD) for split_type in split_types: for data_type in split_data_types: