#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 16 05:59:12 2022

@author: tanu
"""
# Attempting feature selection for LR WITHOUT the ClfSwitcher class

#%% Import libraries, data, and scoring func: UQ_pnca_ML.py
rs    = {'random_state': 42}
njobs = {'n_jobs': 10}

#%% Logistic Regression + hyperparams + FS: Pipeline takes GridSearchCV (not the other way round!)
model_lr    = LogisticRegression(**rs)
model_rfecv = RFECV(estimator = model_lr
                    , cv      = skf_cv
                    #, cv     = 10
                    , scoring = 'matthews_corrcoef')

# model_sfs = SequentialFeatureSelector(estimator = model_lr
#                                       , n_features_to_select = 'auto'
#                                       , tol = None
#                                       #, cv = 10
#                                       , cv = rskf_cv
#                                       #, direction = 'backward'
#                                       , direction = 'forward'
#                                       , **njobs)

# NOTE: 'elasticnet' is only valid with the saga solver and also requires
# l1_ratio, otherwise LogisticRegression raises a ValueError; it gets its
# own grid entry below with an arbitrary midpoint l1_ratio of 0.5.
param_grid2 = [
    {'C'        : np.logspace(0, 4, 10),
     'penalty'  : ['none', 'l1', 'l2'],
     'max_iter' : list(range(100, 800, 100)),
     'solver'   : ['saga']},

    {'C'        : np.logspace(0, 4, 10),
     'penalty'  : ['elasticnet'],
     'l1_ratio' : [0.5],
     'max_iter' : list(range(100, 800, 100)),
     'solver'   : ['saga']},

    {'C'        : np.logspace(0, 4, 10),
     'penalty'  : ['l2', 'none'],
     'max_iter' : list(range(100, 800, 100)),
     'solver'   : ['newton-cg', 'lbfgs', 'sag']},

    {'C'        : np.logspace(0, 4, 10),
     'penalty'  : ['l1', 'l2'],
     'max_iter' : list(range(100, 800, 100)),
     'solver'   : ['liblinear']}

    # smaller grids for testing
    # , {'C': np.logspace(0, 4, 10),
    #    'penalty': ['l1', 'l2'],
    #    'max_iter': [100],
    #    'solver': ['saga']}
    # , {'C': [1],
    #    'penalty': ['l1'],
    #    'max_iter': [100],
    #    'solver': ['saga']}
]

#-------------------------------------------------------------------------------
# Grid search CV + FS
gscv_lr = GridSearchCV(model_lr
                       , param_grid2
                       , scoring = mcc_score_fn, refit = 'mcc'
                       , cv      = skf_cv
                       , return_train_score = False
                       , verbose = 3
                       , **njobs)
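#-------------------------------------------------------------------------------
# mcc_score_fn and skf_cv are loaded from UQ_pnca_ML.py and not defined here.
# A minimal sketch of what they could look like, ASSUMING mcc_score_fn is a
# scorer dict (needed for refit = 'mcc' above) and skf_cv is stratified CV;
# kept commented out so the imported objects are not shadowed:
# from sklearn.metrics import make_scorer, matthews_corrcoef
# from sklearn.model_selection import StratifiedKFold
# mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
# skf_cv       = StratifiedKFold(n_splits = 10, shuffle = True, **rs)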
#-------------------------------------------------------------------------------
# Create pipeline: scale -> RFECV feature selection -> grid-searched LR
pipeline2 = Pipeline([('pre', MinMaxScaler())
                      #, ('feature_selection', sfs_selector)
                      , ('feature_selection', model_rfecv)
                      , ('clf', gscv_lr)])

# Fit
pipeline2.fit(X, y)
pipeline2.predict(X_bts)

# Assigning fit and then running predict: sanity check
#lr_fs = pipeline2.fit(X, y)
#lr_fs.predict(X_bts)
###############################################################################
# Feature selection: AFTER model selection
# https://towardsdatascience.com/5-feature-selection-method-from-scikit-learn-you-should-know-ed4d116e4172
###############################################################################

######################################
# Blind test
######################################
# See how it does on the BLIND test
#test_predict = gscv_lr_fit.predict(X_bts)
test_predict = pipeline2.predict(X_bts)
print(test_predict)

print('\nMCC on blind test:'     , round(matthews_corrcoef(y_bts, test_predict), 2))
print('\nAccuracy on blind test:', round(accuracy_score(y_bts, test_predict), 2))

# Create a dict with all scores
lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items())
               'bts_fscore'     : None
               , 'bts_mcc'      : None
               , 'bts_precision': None
               , 'bts_recall'   : None
               , 'bts_accuracy' : None
               , 'bts_roc_auc'  : None
               , 'bts_jaccard'  : None}
lr_bts_dict

lr_bts_dict['bts_fscore']    = round(f1_score(y_bts, test_predict), 2)
lr_bts_dict['bts_mcc']       = round(matthews_corrcoef(y_bts, test_predict), 2)
lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict), 2)
lr_bts_dict['bts_recall']    = round(recall_score(y_bts, test_predict), 2)
lr_bts_dict['bts_accuracy']  = round(accuracy_score(y_bts, test_predict), 2)
lr_bts_dict['bts_roc_auc']   = round(roc_auc_score(y_bts, test_predict), 2)
lr_bts_dict['bts_jaccard']   = round(jaccard_score(y_bts, test_predict), 2)
lr_bts_dict

# Create a df from the dict with all scores
lr_bts_df         = pd.DataFrame.from_dict(lr_bts_dict, orient = 'index')
lr_bts_df.columns = ['Logistic_Regression']
print(lr_bts_df)

# def Merge(dict1, dict2):
#     res = {**dict1, **dict2}
#     return res
# d2 = {'best_model_params': list(gscv_lr_fit_be_mod.items())}
# d3 = Merge(d2, lr_bts_dict)

# Create df with best model params
model_params            = pd.Series(['best_model_params', list(gscv_lr_fit_be_mod.items())])
model_params_df         = model_params.to_frame()
model_params_df.columns = ['Logistic_Regression']
model_params_df.columns

# Combine the df of scores and the best model params
lr_bts_df.columns
lr_output = pd.concat([model_params_df, lr_bts_df], axis = 0)
lr_output

# Format the combined df: drop the best_model_params row from lr_output
lr_df = lr_output.drop([0], axis = 0)
lr_df
#FIXME: tidy the index of the formatted df

###############################################################################
# FIXME: confusion matrix
print(confusion_matrix(y_bts, test_predict))
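#-------------------------------------------------------------------------------
# Not in the original script: the fitted pipeline2 can be interrogated
# directly for the RFECV-selected features and the grid-search winner,
# assuming the step names used above ('feature_selection', 'clf'):
fitted_rfecv = pipeline2.named_steps['feature_selection']
fitted_gscv  = pipeline2.named_steps['clf']

# MinMaxScaler keeps column order, so the support mask maps back to X.columns
print('\nFeatures kept by RFECV in the pipeline:', fitted_rfecv.n_features_
      , '\nThese are:', X.columns[fitted_rfecv.get_support()])
print('\nBest params from the inner grid search:', fitted_gscv.best_params_)
print('\nBest CV MCC:', round(fitted_gscv.best_score_, 2))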
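#-------------------------------------------------------------------------------
# One way to tackle the FIXME above: give the params row a proper label so
# the index needs no tidying and nothing has to be dropped by position.
# A sketch, assuming gscv_lr_fit_be_mod is dict-like (it has .items() above):
model_params_df2 = pd.DataFrame({'Logistic_Regression': [dict(gscv_lr_fit_be_mod)]}
                                , index = ['best_model_params'])
lr_output2 = pd.concat([model_params_df2, lr_bts_df], axis = 0)
lr_output2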
#%% Feature selection
#####################
# Feature selection: AFTER model selection?
# ADD that within the loop
# https://towardsdatascience.com/5-feature-selection-method-from-scikit-learn-you-should-know-ed4d116e4172
#####################
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SequentialFeatureSelector

# RFE: ranks features by model coef or feature_importance
rfe_selector = RFECV(estimator = LogisticRegression(**rs
                                                    , penalty  = 'l2'
                                                    , solver   = 'saga'
                                                    , max_iter = 100
                                                    , C        = 1.0)
                     #, min_features_to_select = 1 # default
                     , step = 1
                     , cv   = 10)
rfe_selector.fit(X, y)
rfe_fs = X.columns[rfe_selector.get_support()]
print('\nFeatures selected from Recursive Feature Elimination:', len(rfe_fs)
      , '\nThese are:', rfe_fs)

# Blind test
TEST_PREDICT = rfe_selector.predict(X_bts)
TEST_PREDICT
print('\nMCC on blind test:'     , round(matthews_corrcoef(y_bts, TEST_PREDICT), 2))
print('\nAccuracy on blind test:', round(accuracy_score(y_bts, TEST_PREDICT), 2))

# Add pipeline with preprocessing: changes the numbers
pipe = Pipeline([('pre', MinMaxScaler())
                 #, ('fs', model_rfecv)
                 , ('fs', rfe_selector)
                 , ('clf', LogisticRegression(**rs))])
pipe.fit(X, y)
tp = pipe.predict(X_bts)
print('\nMCC on blind test:'     , round(matthews_corrcoef(y_bts, tp), 2))
print('\nAccuracy on blind test:', round(accuracy_score(y_bts, tp), 2))

##################################
# SFM: selects on model coef or feature_importance
sfm_selector = SelectFromModel(estimator = LogisticRegression(**rs
                                                              , penalty  = 'l1'
                                                              , solver   = 'saga'
                                                              , max_iter = 100
                                                              , C        = 1.0)
                               , threshold    = 'median' # median by default
                               , max_features = None)
sfm_selector.fit(X, y)
sfm_fs = X.columns[sfm_selector.get_support()]
print('\nFeatures selected from Select From Model:', len(sfm_fs)
      , '\nThese are:', sfm_fs)

# SFS: greedy, CV-driven selection
sfs_selector = SequentialFeatureSelector(estimator = LogisticRegression(**rs
                                                                        , penalty  = 'l1'
                                                                        , solver   = 'saga'
                                                                        , max_iter = 100
                                                                        , C        = 1.0)
                                         , n_features_to_select = 'auto'
                                         , tol = None
                                         , cv  = 10
                                         #, cv = skf_cv
                                         #, direction = 'backward'
                                         , direction = 'forward'
                                         , **njobs)
sfs_selector.fit(X, y)
sfsb_fs = X.columns[sfs_selector.get_support()]
print('\nFeatures selected from Sequential Feature Selector (Greedy):', len(sfsb_fs)
      , '\nThese are:', sfsb_fs)

# Features selected from Sequential Feature Selector (Greedy, Backward): 7 [cv = skf_cv]
# These are: Index(['ligand_distance', 'duet_stability_change', 'ddg_foldx', 'deepddg',
#                   'contacts', 'rd_values', 'snap2_score'])

# Features selected from Sequential Feature Selector (Greedy, Backward): 7 [cv = 10]
# These are: Index(['ligand_distance', 'deepddg', 'contacts', 'rsa', 'kd_values',
#                   'rd_values', 'maf'])
#-----
# Features selected from Sequential Feature Selector (Greedy, Forward): 6 [cv = skf_cv]
# These are: Index(['ligand_distance', 'ddg_dynamut2', 'rsa', 'kd_values', 'rd_values', 'maf'])

# Features selected from Sequential Feature Selector (Greedy, Forward): 6 [cv = 10]
# These are: Index(['duet_stability_change', 'deepddg', 'ddg_dynamut2', 'rsa', 'kd_values', 'maf'])

###############################################################################
# IMP: nice eg of including it as part of a pipeline
# https://www.tomasbeuzen.com/post/scikit-learn-gridsearch-pipelines/
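#-------------------------------------------------------------------------------
# Not in the original: a quick sketch comparing the three feature sets fitted
# above, to see where the selectors agree:
rfe_set = set(rfe_fs)
sfm_set = set(sfm_fs)
sfs_set = set(sfsb_fs)
print('\nSelected by all three selectors :', rfe_set & sfm_set & sfs_set)
print('\nSelected by at least one selector:', rfe_set | sfm_set | sfs_set)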
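#-------------------------------------------------------------------------------
# A compact sketch of the pattern from the tomasbeuzen.com link above: put the
# selector inside the pipeline and let GridSearchCV tune the selector and the
# classifier together. Step names, parameter values, and the small grid here
# are illustrative only, not from the original script:
pipe_fs = Pipeline([('pre', MinMaxScaler())
                    , ('fs', SequentialFeatureSelector(LogisticRegression(**rs)
                                                       , direction = 'forward'))
                    , ('clf', LogisticRegression(**rs))])

grid_fs = GridSearchCV(pipe_fs
                       , {'fs__n_features_to_select': [4, 6, 8]
                          , 'clf__C': [0.1, 1, 10]}
                       , scoring = 'matthews_corrcoef'
                       , cv      = 10
                       , **njobs)
# grid_fs.fit(X, y)     # expensive: SFS runs inside every grid cell
# grid_fs.best_params_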