diff --git a/scripts/ml/feature_selection_iterator.py b/scripts/ml/feature_selection_iterator.py index f4571b9..3340d41 100755 --- a/scripts/ml/feature_selection_iterator.py +++ b/scripts/ml/feature_selection_iterator.py @@ -9,7 +9,7 @@ import sys, os import pandas as pd import numpy as np import re -#import prettyprint as pp +#import prettyprint as pp ############################################################################### homedir = os.path.expanduser("~") sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions') @@ -18,7 +18,7 @@ sys.path outdir = homedir + '/git/LSHTM_ML/output/fs/' #==================== -# Import ML functions +# Import ML functions #==================== from MultClfs import * @@ -27,43 +27,45 @@ from SplitTTS import * from FS import * # param dict for getmldata() combined_model_paramD = {'data_combined_model' : False - , 'use_or' : False - , 'omit_all_genomic_features': False - , 'write_maskfile' : False - , 'write_outfile' : False } + , 'use_or' : False + , 'omit_all_genomic_features': False + , 'write_maskfile' : False + , 'write_outfile' : False } ############################################################################### #ml_genes = ["pncA", "embB", "katG", "rpoB", "gid"] # outdir = homedir + '/git/Data/ml_combined/fs/' -ml_gene_drugD = {'pncA' : 'pyrazinamide' - # , 'embB' : 'ethambutol' - # , 'katG' : 'isoniazid' - # , 'rpoB' : 'rifampicin' - # , 'gid' : 'streptomycin' - } +ml_gene_drugD = { + 'pncA' : 'pyrazinamide', # NOTE: may need re-run for 80_20 and sl + #'embB' : 'ethambutol', + #'katG' : 'isoniazid', #NOTE: RF only for all split-types actual + #'rpoB' : 'rifampicin', + #'gid' : 'streptomycin' # NOTE: for gid, run 'actual' on 80/20 and sl only + } gene_dataD={} -# NOTE: for gid, run 'actual' on 80/20 and sl only -split_types = ['70_30', '80_20', 'sl'] -split_data_types = ['actual', 'complete'] -#split_types = ['70_30'] + +#split_types = ['70_30', '80_20', 'sl'] #split_data_types = ['actual', 'complete'] 
-#fs_models = [('Logistic Regression' , LogisticRegression(**rs) )] +split_types = ['70_30'] +#split_data_types = ['actual', 'complete'] +split_data_types = ['actual'] fs_models = [ - ('Logistic Regression' , LogisticRegression(**rs) ) - , ('Ridge Classifier' , RidgeClassifier(**rs) ) - #, ('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) - #, ('Decision Tree' , DecisionTreeClassifier(**rs) ) - #, ('Extra Tree' , ExtraTreeClassifier(**rs) ) - #, ('Extra Trees' , ExtraTreesClassifier(**rs) ) - #, ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) - #, ('LDA' , LinearDiscriminantAnalysis() ) - #, ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) - #, ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) - #, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) ) - #, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) - #, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) + #('Ridge Classifier' , RidgeClassifier(**rs) ), + #('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ), + #('Logistic Regression' , LogisticRegression(**rs, **njobs) ), + #('AdaBoost Classifier' , AdaBoostClassifier(**rs) ), + #('Gradient Boosting' , GradientBoostingClassifier(**rs) ), + #('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ), + #('Decision Tree' , DecisionTreeClassifier(**rs) ), + #('Extra Trees' , ExtraTreesClassifier(**rs, **njobs) ), + #('Extra Tree' , ExtraTreeClassifier(**rs) ), + #('LDA' , LinearDiscriminantAnalysis() ), + #('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs, **njobs) ), + #('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) + #('Random Forest' , RandomForestClassifier(n_estimators = 1000, verbose=3, **rs, **njobs ) ) + ('XGBoost' , XGBClassifier(verbosity=3, use_label_encoder=False, **rs, **njobs) ) ] for gene, drug in ml_gene_drugD.items(): @@ -122,9 +124,10 @@ for gene, drug in ml_gene_drugD.items(): index = index+1 #out_fsD[model_name] = {} current_model = {} + 
model_name_clean = model_name.replace(' ','-') for k, v in paramD.items(): - out_filename = gene.lower() + '_' + split_type + '_' + data_type + '_' + model_name + '_' + k + '.json' + out_filename = outdir + gene.lower() + '_' + split_type + '_' + data_type + '_' + model_name_clean + '_' + k + '.json' fsD_params=paramD[k] #out_fsD[model_name][k] = fsgs_rfecv( @@ -145,6 +148,8 @@ for gene, drug in ml_gene_drugD.items(): # write current model to disk #print(current_model) + print("⚠️ ⚠️ ⚠️ WRITING TO FILE: ", out_filename, "⚠️ ⚠️ ⚠️") out_json = json.dumps(current_model) with open(out_filename, 'w', encoding="utf-8") as file: file.write(out_json) + print("⚠️ ⚠️ ⚠️ Finished writing to: ", out_filename, "⚠️ ⚠️ ⚠️")