#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 2 12:13:53 2022

@author: tanu

Exploratory comparison of a random forest trained on all features against
a random forest trained only on the features retained by Boruta.

Three experiments are run in sequence:
  1. a public example dataset (the CSV at ``URL``, target column 'Position'),
  2. project data restricted to numerical (int64/float64) columns,
  3. project data with numerical columns min-max scaled and categorical
     (object/bool) columns one-hot encoded.

NOTE: experiments 2 and 3 call ``combined_DF_OS(combined_df)``. Neither name
is defined or imported in this file; both must already be in scope when the
script runs (the original notes point to cm_logo_skf_v2.py). The result is
indexed with the keys 'X', 'y', 'X_bts' and 'y_bts'.
"""
# https://analyticsindiamag.com/hands-on-guide-to-automated-feature-selection-using-boruta/
import numpy as np
import pandas as pd
from boruta import BorutaPy
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder


def _print_scores(label, y_true, y_pred):
    """Print accuracy and Matthews correlation coefficient for one model."""
    print(f"{label}: accuracy = {accuracy_score(y_true, y_pred):.4f}, "
          f"MCC = {matthews_corrcoef(y_true, y_pred):.4f}")


##############################################################################
# example data
URL = "https://raw.githubusercontent.com/Aditya1001001/English-Premier-League/master/pos_modelling_data.csv"
data = pd.read_csv(URL)
data.info()

X = data.drop('Position', axis=1)
y = data['Position']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=.2,
                                                    random_state=1)

# 1 model: every feature
rf_all_features = RandomForestClassifier(random_state=1,
                                         n_estimators=1000,
                                         max_depth=5)
rf_all_features.fit(X_train, y_train)
# predict once and reuse; each predict runs all 1000 trees
y_pred = rf_all_features.predict(X_test)
_print_scores("RF, all features", y_test, y_pred)

# BORUTA
rfc = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=2, random_state=1)
boruta_selector.fit(np.array(X_train), np.array(y_train))

# Tells you how many features: GOOD
print("Ranking: ", boruta_selector.ranking_)
print("No. of significant features: ", boruta_selector.n_features_)

selected_rf_features = pd.DataFrame({'Feature': list(X_train.columns),
                                     'Ranking': boruta_selector.ranking_})
# tells you the ranking: GOOD
print(selected_rf_features.sort_values(by='Ranking'))

X_important_train = boruta_selector.transform(np.array(X_train))
X_important_test = boruta_selector.transform(np.array(X_test))

rf_boruta = RandomForestClassifier(random_state=1,
                                   n_estimators=1000,
                                   max_depth=5)
rf_boruta.fit(X_important_train, y_train)
_print_scores("RF, Boruta features", y_test,
              rf_boruta.predict(X_important_test))

##############################################################################
# my data : ONLY numerical values
# from old scripts (cm_logo_skf_v2.py)
fooD = combined_DF_OS(combined_df)

allF = fooD['X']
numerical_ix = fooD['X'].select_dtypes(include=['int64', 'float64']).columns

# just numerical for X_train and X_test
X_train_numF = fooD['X'][numerical_ix]
X_test_numF = fooD['X_bts'][numerical_ix]

#X_train = allF
X_train = X_train_numF
X_test = X_test_numF
y_train = fooD['y']
y_test = fooD['y_bts']

# 1 model
rf_all_features = RandomForestClassifier(random_state=1,
                                         n_estimators=1000,
                                         max_depth=5)
rf_all_features.fit(X_train, y_train)
_print_scores("RF, all numerical features", y_test,
              rf_all_features.predict(X_test))

# BORUTA
rfc = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=2, random_state=1)
boruta_selector.fit(np.array(X_train), np.array(y_train))

# Tells you how many features: GOOD
print("Ranking: ", boruta_selector.ranking_)
print("No. of significant features: ", boruta_selector.n_features_)

selected_rf_features = pd.DataFrame({'Feature': list(X_train.columns),
                                     'Ranking': boruta_selector.ranking_})
# tells you the ranking: GOOD
print(selected_rf_features.sort_values(by='Ranking'))

X_important_train = boruta_selector.transform(np.array(X_train))
X_important_test = boruta_selector.transform(np.array(X_test))

rf_boruta = RandomForestClassifier(random_state=1,
                                   n_estimators=1000,
                                   max_depth=5)
rf_boruta.fit(X_important_train, y_train)
_print_scores("RF, Boruta numerical features", y_test,
              rf_boruta.predict(X_important_test))

##############################################################################
# my data : using both numerical and categorical
# from old scripts (cm_logo_skf_v2.py)
fooD = combined_DF_OS(combined_df)

numerical_ix = fooD['X'].select_dtypes(include=['int64', 'float64']).columns
print("\nNo. of numerical indices:", len(numerical_ix))

categorical_ix = fooD['X'].select_dtypes(include=['object', 'bool']).columns
print("\nNo. of categorical indices:", len(categorical_ix))

# Was "mixeds", which never matched the check below and left `t` undefined.
var_type = "mixed"
if var_type == 'mixed':
    # handle_unknown='ignore': the encoder is fitted on the training data
    # only, so a category that appears only in the test data must not raise.
    t = [('num', MinMaxScaler(), numerical_ix),
         ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix)]
else:
    raise ValueError(f"unsupported var_type: {var_type!r}")

# sparse_threshold=0 forces a dense array; np.array() on a scipy sparse
# matrix does not give the 2-D numeric array passed to BorutaPy below.
col_transform = ColumnTransformer(transformers=t,
                                  remainder='passthrough',
                                  sparse_threshold=0)

#--------------ALEX help
# col_transform
# col_transform.fit(X)
# test = col_transform.transform(X)
# print(col_transform.get_feature_names_out())
# foo = col_transform.fit_transform(X)

Xm_train = col_transform.fit_transform(fooD['X'])
print("train shape before/after encoding:", fooD['X'].shape, Xm_train.shape)

# Only transform the test data: re-fitting here would rescale and re-encode
# with test-set statistics and could yield different columns from training.
Xm_test = col_transform.transform(fooD['X_bts'])
print("test shape before/after encoding:", fooD['X_bts'].shape, Xm_test.shape)

X_train = Xm_train.copy()
X_test = Xm_test.copy()

y_train = fooD['y']
y_test = fooD['y_bts']
print("y shapes (train, test):", y_train.shape, y_test.shape)

# column names of the encoded matrix, in output order
encoded_colnames = pd.Index(col_transform.get_feature_names_out())

#======================
# 1 model
rf_all_features = RandomForestClassifier(random_state=1,
                                         n_estimators=1000,
                                         max_depth=5)
rf_all_features.fit(X_train, y_train)
_print_scores("RF, all mixed features", y_test,
              rf_all_features.predict(X_test))

# BORUTA
rfc = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=2, random_state=1)
boruta_selector.fit(np.array(X_train), np.array(y_train))

# Tells you how many features: GOOD
print("Ranking: ", boruta_selector.ranking_)
print("No. of significant features: ", boruta_selector.n_features_)

# Rebuild the ranking table from the encoded column names. X_train is a
# plain array here (no .columns), and reusing the table from the
# numerical-only run would report the wrong features.
selected_rf_features = pd.DataFrame({'Feature': list(encoded_colnames),
                                     'Ranking': boruta_selector.ranking_})
# tells you the ranking: GOOD
foo2 = selected_rf_features.sort_values(by='Ranking')
print(foo2)

X_important_train = boruta_selector.transform(np.array(X_train))
X_important_test = boruta_selector.transform(np.array(X_test))

rf_boruta = RandomForestClassifier(random_state=1,
                                   n_estimators=1000,
                                   max_depth=5)
rf_boruta.fit(X_important_train, y_train)
_print_scores("RF, Boruta mixed features", y_test,
              rf_boruta.predict(X_important_test))

##################################
# trying to one hot encode at start
# perhaps
#col_transform.fit(fooD['X'])
#encoded_colnames = pd.Index(col_transform.get_feature_names_out())

# def encode_and_bind(original_dataframe, feature_to_encode):
#     dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
#     res = pd.concat([original_dataframe, dummies], axis=1)
#     res = res.drop([feature_to_encode], axis=1)
#     return(res)

# features_to_encode = ['feature_1', 'feature_2', 'feature_3',
#                       'feature_4']
# for feature in features_to_encode:
#     X_train_enc = encode_and_bind(fooD['X'], feature)
#     X_test_enc = encode_and_bind(fooD['X_bts'], feature)

# c1 = X_train_enc.columns
# c2 = X_test_enc.columns
# X_train_enc.shape
# X_test_enc.shape

# This one is better!
# features_to_encode was only assigned inside the commented-out code above;
# define it from the categorical columns as that code did.
# NOTE(review): assumes every column in categorical_ix also exists in
# combined_df -- confirm.
features_to_encode = list(categorical_ix)
a = pd.get_dummies(combined_df, columns=features_to_encode)
a1 = a.columns
# drop() removes row labels by default; these are column names.
a2 = a.drop(columns=['gene_name', 'dst', 'dst_mode'])