#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 5 12:57:32 2022

@author: tanu
"""
import os, sys
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score,
                             recall_score, roc_auc_score, roc_curve, f1_score)
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV

#%% Baseline pipeline: min-max scaling + class-weighted logistic regression
model = Pipeline(steps = [
    ('pre', MinMaxScaler()),
    ('reg', LogisticRegression(class_weight = 'balanced'))])

# Scorers for the positive (resistant) class
def precision(y_true, y_pred):
    return precision_score(y_true, y_pred, pos_label = 1)

def recall(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label = 1)

def f1(y_true, y_pred):
    return f1_score(y_true, y_pred, pos_label = 1)

acc  = make_scorer(accuracy_score)
prec = make_scorer(precision)
rec  = make_scorer(recall)
f1   = make_scorer(f1)   # note: rebinds the name f1 from the function to its scorer

# 10-fold CV with multiple metrics (X_train/y_train are assumed to exist already)
output = cross_validate(model, X_train, y_train
                        , scoring = {'acc' : acc
                                     ,'prec': prec
                                     ,'rec' : rec
                                     ,'f1'  : f1}
                        , cv = 10
                        , return_train_score = False)
pd.DataFrame(output).mean()

#%% Hold-out evaluation: classification_report gives the per-class breakdown
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1_score(y_test, y_pred)

# roc_auc_score should be given scores/probabilities, not hard class labels
y_probs = model.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_probs)
#roc_curve(y_test, y_probs)

target_names = {1: 'Resistant', 0: 'Sensitive'}
print(classification_report(y_test, y_pred
                            #, target_names=y_test.map(target_names)
                            ))
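#%% Optional sketch (my addition, not part of the original analysis): plot the
# ROC curve for the fitted pipeline, following up on the commented-out
# roc_curve call above. Assumes X_test/y_test and y_probs from the previous
# cell; matplotlib is not imported at the top of this script.
import matplotlib.pyplot as plt

fpr, tpr, _ = roc_curve(y_test, y_probs)
plt.plot(fpr, tpr, label = 'MinMaxScaler + LogisticRegression')
plt.plot([0, 1], [0, 1], linestyle = '--', label = 'chance')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()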
#%% Exhaustive feature-combination CV (NOT SURE!)
from itertools import combinations

def train(X):
    return cross_validate(model, X, y_train
                          #, scoring = make_scorer(accuracy_score)
                          , scoring = {'acc' : acc
                                       ,'prec': prec
                                       ,'rec' : rec
                                       ,'f1'  : f1}
                          , cv = 10
                          , return_train_score = False)
                          #, return_estimator = True)['test_score']

# combinations(cols, len(cols)) yields only the full column set; loop over
# smaller sizes as well to try genuine subsets.
scores = [train(X_train.loc[:, list(cols)])
          for cols in combinations(X_train.columns, len(X_train.columns))]
means = [pd.DataFrame(score).mean() for score in scores]   # cross_validate returns a dict
means

#%% TO TRY: feature-selection approaches
# (see the SequentialFeatureSelector sketch at the end of this script)
# https://rasbt.github.io/mlxtend/
# Informative Stack Exchange posts:
# https://datascience.stackexchange.com/questions/937/does-scikit-learn-have-a-forward-selection-stepwise-regression-algorithm
# https://datascience.stackexchange.com/questions/24405/how-to-do-stepwise-regression-using-sklearn/24447#24447
# https://stats.stackexchange.com/questions/204141/difference-between-selecting-features-based-on-f-regression-and-based-on-r2
# scikit-learn >= 0.24 supports SequentialFeatureSelector:
# https://scikit-learn.org/stable/auto_examples/release_highlights/plot_release_highlights_0_24_0.html#new-sequentialfeatureselector-transformer
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html
# https://www.scikit-yb.org/en/latest/api/model_selection/rfecv.html

#%% RFE: feature selection for classification
# Other worked examples:
# https://towardsdatascience.com/feature-selection-techniques-for-classification-and-python-tips-for-their-application-10c0ddd7918b
# https://towardsdatascience.com/feature-selection-using-python-for-classification-problem-b5f00a1c7028
#model_logistic = LogisticRegression(solver = 'lbfgs'
#                                    , multi_class = 'multinomial'
#                                    , max_iter = 1000)  # alternative estimator
model_logistic = LogisticRegression()

sel_rfe_logistic = RFE(estimator = model_logistic
                       , n_features_to_select = 4
                       , step = 1)
X_train_rfe_logistic = sel_rfe_logistic.fit_transform(X_train, y_train)
print(sel_rfe_logistic.get_support())
print(sel_rfe_logistic.ranking_)

#%% RFECV
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
# Pick ONE target and ONE feature set (later assignments overwrite earlier ones):
#target = target1
#target = target3
target = target4

#X_train, X_test, y_train, y_test = train_test_split(X_vars1, target, test_size = 0.33, random_state = 42)
#X_train, X_test, y_train, y_test = train_test_split(X_vars2, target, test_size = 0.33, random_state = 42)
#X_train, X_test, y_train, y_test = train_test_split(X_vars3, target, test_size = 0.33, random_state = 42)
#X_train, X_test, y_train, y_test = train_test_split(X_vars5, target, test_size = 0.33, random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(X_vars11, target, test_size = 0.33, random_state = 42)

model_logistic = LogisticRegression()
sel_rfe_logistic = RFECV(estimator = model_logistic
                         , cv = 10
                         , step = 1)
X_train_rfe_logistic = sel_rfe_logistic.fit_transform(X_train, y_train)
print(sel_rfe_logistic.get_support())
X_train.columns
print(sel_rfe_logistic.ranking_)
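#%% Sketch (my addition): forward selection with SequentialFeatureSelector
# (scikit-learn >= 0.24), one of the "TO TRY" options linked above.
# Assumes X_train/y_train from the RFECV cell; n_features_to_select = 4 mirrors
# the RFE cell and is an arbitrary choice, not something fixed by this analysis.
from sklearn.feature_selection import SequentialFeatureSelector

sfs_logistic = SequentialFeatureSelector(estimator = LogisticRegression()
                                         , n_features_to_select = 4
                                         , direction = 'forward'
                                         , scoring = 'f1'
                                         , cv = 10)
sfs_logistic.fit(X_train, y_train)
print(X_train.columns[sfs_logistic.get_support()])   # names of selected features
X_train_sfs = sfs_logistic.transform(X_train)        # reduced training matrix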