From 9071a8705608b95f947194a426de41b61d274f43 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Sat, 2 Jul 2022 11:12:39 +0100 Subject: [PATCH] fs: cut down the number of iterations --- scripts/ml/feature_selection_iterator.py | 144 ++++++++++++----------- 1 file changed, 74 insertions(+), 70 deletions(-) diff --git a/scripts/ml/feature_selection_iterator.py b/scripts/ml/feature_selection_iterator.py index b8ea10b..f4571b9 100755 --- a/scripts/ml/feature_selection_iterator.py +++ b/scripts/ml/feature_selection_iterator.py @@ -42,6 +42,7 @@ ml_gene_drugD = {'pncA' : 'pyrazinamide' # , 'gid' : 'streptomycin' } gene_dataD={} +# NOTE: for gid, run 'actual' on 80/20 and sl only split_types = ['70_30', '80_20', 'sl'] split_data_types = ['actual', 'complete'] #split_types = ['70_30'] @@ -49,98 +50,101 @@ split_data_types = ['actual', 'complete'] #fs_models = [('Logistic Regression' , LogisticRegression(**rs) )] -fs_models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) - , ('Decision Tree' , DecisionTreeClassifier(**rs) ) - #, ('Extra Tree' , ExtraTreeClassifier(**rs) ) - #, ('Extra Trees' , ExtraTreesClassifier(**rs) ) - #, ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) - #, ('LDA' , LinearDiscriminantAnalysis() ) - #, ('Logistic Regression' , LogisticRegression(**rs) ) - #, ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) - #, ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) - #, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) ) - #, ('Ridge Classifier' , RidgeClassifier(**rs) ) - #, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) - #, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) - ] +fs_models = [ + ('Logistic Regression' , LogisticRegression(**rs) ) + , ('Ridge Classifier' , RidgeClassifier(**rs) ) + #, ('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) + #, ('Decision Tree' , DecisionTreeClassifier(**rs) ) + #, ('Extra Tree' , ExtraTreeClassifier(**rs) ) + #, ('Extra Trees' , ExtraTreesClassifier(**rs) ) + #, ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) + #, ('LDA' , LinearDiscriminantAnalysis() ) + #, ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) + #, ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) + #, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) ) + #, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) + #, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) + ] for gene, drug in ml_gene_drugD.items(): - print ('\nGene:', gene - , '\nDrug:', drug) + #print ('\nGene:', gene + # , '\nDrug:', drug) gene_low = gene.lower() gene_dataD[gene_low] = getmldata(gene, drug - , data_combined_model = False # this means it doesn't include 'gene_name' as a feauture as a single gene-target shouldn't have it. - , use_or = False - , omit_all_genomic_features = False - , write_maskfile = False - , write_outfile = False) + , data_combined_model = False # this means it doesn't include 'gene_name' as a feauture as a single gene-target shouldn't have it. + , use_or = False + , omit_all_genomic_features = False + , write_maskfile = False + , write_outfile = False) for split_type in split_types: for data_type in split_data_types: # unused per-split outfile #out_filename = outdir + gene.lower() + '_'+split_type+'_' + data_type + '.json' tempD=split_tts(gene_dataD[gene_low] - , data_type = data_type - , split_type = split_type - , oversampling = True # TURN IT ON TO RUN THE OTHERS BIS - , dst_colname = 'dst' - , target_colname = 'dst_mode' - , include_gene_name = True - ) + , data_type = data_type + , split_type = split_type + , oversampling = True # TURN IT ON TO RUN THE OTHERS BIS + , dst_colname = 'dst' + , target_colname = 'dst_mode' + , include_gene_name = True + ) paramD = { 'baseline_paramD': { 'input_df' : tempD['X'] - , 'target' : tempD['y'] - , 'var_type' : 'mixed' - , 'resampling_type': 'none'} - - , 'smnc_paramD' : { 'input_df' : tempD['X_smnc'] - , 'target' : tempD['y_smnc'] - , 'var_type' : 'mixed' - , 'resampling_type' : 'smnc'} - - , 'ros_paramD' : { 'input_df' : tempD['X_ros'] - , 'target' : tempD['y_ros'] - , 'var_type' : 'mixed' - , 'resampling_type' : 'ros'} - - , 'rus_paramD' : { 'input_df' : tempD['X_rus'] - , 'target' : tempD['y_rus'] - , 'var_type' : 'mixed' - , 'resampling_type' : 'rus'} - - , 'rouC_paramD' : { 'input_df' : tempD['X_rouC'] - , 'target' : tempD['y_rouC'] - , 'var_type' : 'mixed' - , 'resampling_type': 'rouC'} + , 'target' : tempD['y'] + , 'var_type' : 'mixed' + , 'resampling_type': 'none'} + #, 'smnc_paramD' : { 'input_df' : tempD['X_smnc'] + # , 'target' : tempD['y_smnc'] + # , 'var_type' : 'mixed' + # , 'resampling_type' : 'smnc'} + #, 'ros_paramD' : { 'input_df' : tempD['X_ros'] + # , 'target' : tempD['y_ros'] + # , 'var_type' : 'mixed' + # , 'resampling_type' : 'ros'} + #, 'rus_paramD' : { 'input_df' : tempD['X_rus'] + # , 'target' : tempD['y_rus'] + # , 'var_type' : 'mixed' + # , 'resampling_type' : 'rus'} + #, 'rouC_paramD' : { 'input_df' : tempD['X_rouC'] + # , 'target' : tempD['y_rouC'] + # , 'var_type' : 'mixed' + # , 'resampling_type': 'rouC'} } out_fsD = {} index = 1 for model_name, model_fn in fs_models: print('\nRunning classifier with FS:', index - , '\nModel_name:' , model_name - , '\nModel func:' , model_fn) - #, '\nList of models:', models) + , '\nModel_name:' , model_name + , '\nModel func:' , model_fn) + #, '\nList of models:', models) index = index+1 #out_fsD[model_name] = {} current_model = {} + for k, v in paramD.items(): - out_filename = (gene.lower() + '_' + split_type + '_' + data_type + '_' + model_name + '_' + k + '.json') + out_filename = gene.lower() + '_' + split_type + '_' + data_type + '_' + model_name + '_' + k + '.json' fsD_params=paramD[k] - + #out_fsD[model_name][k] = fsgs_rfecv( - thingg = foo( - ) + #current_model[k] = v + + # NOTE: this will silently fail with a syntax error if you don't have all the necessary libraries installed. + # Python will NOT warn you of the missing lib! current_model[k] = fsgs_rfecv( - **fsD_params - , param_gridLd = [{'fs__min_features_to_select': [1]}] - , blind_test_df = tempD['X_bts'] - , blind_test_target = tempD['y_bts'] - , estimator = model_fn - , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below - # NOTE: IS THIS CORRECT?!? - , custom_fs = RFECV(DecisionTreeClassifier(**rs), cv = skf_cv, scoring = 'matthews_corrcoef') - , cv_method = skf_cv - ) - with open(out_filename, 'w') as f: - f.write(json.dumps(current_model) + **fsD_params + , param_gridLd = [{'fs__min_features_to_select': [1]}] + , blind_test_df = tempD['X_bts'] + , blind_test_target = tempD['y_bts'] + , estimator = model_fn + , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below + , custom_fs = RFECV(DecisionTreeClassifier(**rs), cv = skf_cv, scoring = 'matthews_corrcoef') + , cv_method = skf_cv + ) + + # write current model to disk + #print(current_model) + out_json = json.dumps(current_model) + with open(out_filename, 'w', encoding="utf-8") as file: + file.write(out_json)