#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022
Revised on Mon May 16 05:59:12 2022

@author: tanu

Logistic regression with RFECV feature selection, tuned by GridSearchCV.

A Pipeline (MinMaxScaler -> RFECV -> LogisticRegression) is handed to
GridSearchCV, the winning pipeline is refit on (X, y), evaluated on the
blind test set (X_bts, y_bts), and the blind-test scores are collected in
``lr_bts_dict``.

NOTE(review): this file defines none of the names it relies on (X, y,
X_bts, y_bts, skf_cv, rs, njobs, mcc_score_fn, np, and the scikit-learn
classes and metric functions). Presumably a setup script run beforehand
in the same session provides them -- confirm before running standalone.
"""

#%% Logistic Regression + hyperparam + FS: Pipeline inside GridSearchCV

################
# NOTE: an earlier layout put the grid search INSIDE the pipeline
# (pre -> RFECV -> GridSearchCV(LogisticRegression)). It was abandoned
# because the best model could not be retrieved from it:
# https://stackoverflow.com/questions/55609339/how-to-perform-feature-selection-with-gridsearchcv-in-sklearn-in-python
# https://github.com/scikit-learn/scikit-learn/issues/7536
#
# A wider search was also tried and is disabled here: penalties
# none/l1/l2/elasticnet with solver saga, l2/none with newton-cg, lbfgs
# and sag, l1/l2 with liblinear, each with max_iter 100..700 (step 100).
################

##############################################################
# THIS ONE
#########
# Make Pipeline go into GS with FS
#########

# step 1: specify the pipeline (scaler -> feature selection -> model).
# cv and min_features_to_select are set on the RFECV constructor so that
# EVERY grid candidate selects features with skf_cv. Supplying them only
# through their own param_grid dict applies them to that one grid alone,
# because each dict in a param_grid list is searched independently.
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('fs', RFECV(LogisticRegression(**rs),
                 cv=skf_cv,
                 min_features_to_select=1,  # explicit, same value as before
                 scoring='matthews_corrcoef')),
    ('clf', LogisticRegression(**rs)),
])

# step 2: specify param grid as a list of independent grids
param_grid2 = [
    # Grid 1: a single baseline candidate with the classifier left at its
    # constructor settings (kept so the candidate set is unchanged).
    {
        'fs__min_features_to_select': [1],
        'fs__cv': [skf_cv],
    },
    # Grid 2: L2-penalised model, liblinear solver.
    {
        'clf__C': np.logspace(0, 4, 10),
        'clf__penalty': ['l2'],
        'clf__max_iter': [100],
        'clf__solver': ['liblinear'],
    },
    # Grid 3: L2-penalised model, saga solver.
    {
        'clf__C': np.logspace(0, 4, 10),
        'clf__penalty': ['l2'],
        'clf__max_iter': [100],
        'clf__solver': ['saga'],
    },
]

# step 3: perform grid search CV; the pipeline is refit on the 'mcc' scorer
gs_final = GridSearchCV(pipeline,
                        param_grid2,
                        cv=skf_cv,
                        scoring=mcc_score_fn,
                        refit='mcc',
                        verbose=1,
                        return_train_score=False,
                        **njobs)

# fit (result deliberately not assigned; gs_final is fitted in place)
gs_final.fit(X, y)

# Interactive inspection only: these bare expressions print nothing when
# the file is executed as a script.
gs_final.best_params_
gs_final.best_score_
gs_final.best_estimator_

# Predict the blind test set once and reuse the result below.
test_predict = gs_final.predict(X_bts)
print(test_predict)
print('\nMCC on Blind test:',
      round(matthews_corrcoef(y_bts, test_predict), 2))
print('\nAccuracy on Blind test:',
      round(accuracy_score(y_bts, test_predict), 2))

# Now get the features out
all_features = gs_final.feature_names_in_
best_pipeline = gs_final.best_estimator_
fs_step = best_pipeline.named_steps['fs']
sel_features = X.columns[fs_step.get_support()]
n_sf = fs_step.n_features_

# get model name (this is the fitted estimator object; its repr is printed)
model_name = best_pipeline.named_steps['clf']
b_model_params = gs_final.best_params_

print('\n========================================'
      , '\nRunning model:'
      , '\nModel name:', model_name
      , '\n==============================================='
      , '\nRunning feature selection with RFECV for model'
      , '\nTotal no. of features in model:', len(all_features)
      , '\nThese are:\n', all_features, '\n\n'
      , '\nNo of features for best model: ', n_sf
      , '\nThese are:', sel_features, '\n\n'
      , '\nBest Model hyperparams:', b_model_params
      )

######################################
# Blind test
######################################
# See how it does on the BLIND test (unrounded scores)
print(test_predict)
print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))

# create a dict with all blind-test scores, rounded to 2 d.p.
# NOTE(review): bts_roc_auc is computed from hard class predictions, not
# from predict_proba / decision_function scores. Left unchanged so the
# value stays comparable with earlier runs -- confirm this is intended.
lr_bts_dict = {
    'bts_fscore': round(f1_score(y_bts, test_predict), 2),
    'bts_mcc': round(matthews_corrcoef(y_bts, test_predict), 2),
    'bts_precision': round(precision_score(y_bts, test_predict), 2),
    'bts_recall': round(recall_score(y_bts, test_predict), 2),
    'bts_accuracy': round(accuracy_score(y_bts, test_predict), 2),
    'bts_roc_auc': round(roc_auc_score(y_bts, test_predict), 2),
    'bts_jaccard': round(jaccard_score(y_bts, test_predict), 2),
}
# Interactive inspection only (no output when run as a script).
lr_bts_dict