added combined model FS code and run script
This commit is contained in:
parent
78704dec5a
commit
2b953583e2
7 changed files with 1046 additions and 0 deletions
221
scripts/ml/untitled7_boruta.py
Normal file
221
scripts/ml/untitled7_boruta.py
Normal file
|
@ -0,0 +1,221 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Sep 2 12:13:53 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
# https://analyticsindiamag.com/hands-on-guide-to-automated-feature-selection-using-boruta/
|
||||
import numpy as np
import pandas as pd
from boruta import BorutaPy
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
|
||||
|
||||
|
||||
URL = "https://raw.githubusercontent.com/Aditya1001001/English-Premier-League/master/pos_modelling_data.csv"
|
||||
data = pd.read_csv(URL)
|
||||
data.info()
|
||||
X = data.drop('Position', axis = 1)
|
||||
y = data['Position']
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 1)
|
||||
|
||||
rf_all_features = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
|
||||
rf_all_features.fit(X_train, y_train)
|
||||
|
||||
|
||||
# Predict once and reuse the result for every metric: each predict() call
# has to run the whole 1000-tree forest again.
y_pred = rf_all_features.predict(X_test)

accuracy_score(y_test, y_pred)
matthews_corrcoef(y_test, y_pred)
|
||||
|
||||
# BORUTA
|
||||
rfc = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
|
||||
boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=2, random_state=1)
|
||||
boruta_selector.fit(np.array(X_train), np.array(y_train))
|
||||
|
||||
# Tells you how many features: GOOD
|
||||
print("Ranking: ",boruta_selector.ranking_)
|
||||
print("No. of significant features: ", boruta_selector.n_features_)
|
||||
|
||||
selected_rf_features = pd.DataFrame({'Feature':list(X_train.columns),
|
||||
'Ranking':boruta_selector.ranking_})
|
||||
|
||||
# tells you the ranking: GOOD
|
||||
selected_rf_features.sort_values(by='Ranking')
|
||||
|
||||
X_important_train = boruta_selector.transform(np.array(X_train))
|
||||
X_important_test = boruta_selector.transform(np.array(X_test))
|
||||
|
||||
rf_boruta = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
|
||||
rf_boruta.fit(X_important_train, y_train)
|
||||
accuracy_score(y_test, rf_boruta.predict(X_important_test))
|
||||
matthews_corrcoef(y_test, rf_boruta.predict(X_important_test))
|
||||
|
||||
|
||||
|
||||
##############################################################################
|
||||
# my data : ONLY numerical values
|
||||
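# combined_DF_OS() and combined_df come from the old script cm_logo_skf_v2.py;
# presumably that script must be run/imported first -- they are not defined here.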
|
||||
fooD = combined_DF_OS(combined_df)
|
||||
|
||||
allF = fooD['X']
|
||||
numerical_ix = fooD['X'].select_dtypes(include=['int64', 'float64']).columns
|
||||
numerical_ix
|
||||
# just numerical for X_train and X_test
|
||||
X_train_numF = fooD['X'][numerical_ix]
|
||||
X_test_numF = fooD['X_bts'][numerical_ix]
|
||||
#X_train = allF
|
||||
|
||||
X_train = X_train_numF
|
||||
X_test = X_test_numF
|
||||
|
||||
y_train = fooD['y']
|
||||
y_test = fooD['y_bts']
|
||||
|
||||
# 1 model
|
||||
rf_all_features = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
|
||||
rf_all_features.fit(X_train, y_train)
|
||||
|
||||
accuracy_score(y_test, rf_all_features.predict(X_test))
|
||||
matthews_corrcoef(y_test, rf_all_features.predict(X_test))
|
||||
|
||||
|
||||
# BORUTA
|
||||
rfc = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
|
||||
boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=2, random_state=1)
|
||||
boruta_selector.fit(np.array(X_train), np.array(y_train))
|
||||
|
||||
# Tells you how many features: GOOD
|
||||
print("Ranking: ",boruta_selector.ranking_)
|
||||
print("No. of significant features: ", boruta_selector.n_features_)
|
||||
|
||||
selected_rf_features = pd.DataFrame({'Feature':list(X_train.columns),
|
||||
'Ranking':boruta_selector.ranking_})
|
||||
|
||||
# tells you the ranking: GOOD
|
||||
selected_rf_features.sort_values(by='Ranking')
|
||||
|
||||
X_important_train = boruta_selector.transform(np.array(X_train))
|
||||
X_important_test = boruta_selector.transform(np.array(X_test))
|
||||
|
||||
rf_boruta = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
|
||||
rf_boruta.fit(X_important_train, y_train)
|
||||
accuracy_score(y_test, rf_boruta.predict(X_important_test))
|
||||
matthews_corrcoef(y_test, rf_boruta.predict(X_important_test))
|
||||
|
||||
##############################################################################
|
||||
# my data : using both numerical and categorical
|
||||
# from old scripts (cm_logo_skf_v2.py)
|
||||
fooD = combined_DF_OS(combined_df)
|
||||
|
||||
numerical_ix = fooD['X'].select_dtypes(include=['int64', 'float64']).columns
|
||||
numerical_ix
|
||||
print("\nNo. of numerical indices:", len(numerical_ix))
|
||||
|
||||
categorical_ix = fooD['X'].select_dtypes(include=['object', 'bool']).columns
|
||||
categorical_ix
|
||||
print("\nNo. of categorical indices:", len(categorical_ix))
|
||||
|
||||
|
||||
# Must match the string tested by the `if` that follows; the previous value
# "mixeds" never did, so the mixed-type branch was silently skipped.
var_type = "mixed"
|
||||
|
||||
if var_type == 'mixed':
|
||||
|
||||
t = [('num', MinMaxScaler(), numerical_ix)
|
||||
, ('cat', OneHotEncoder(), categorical_ix)]
|
||||
|
||||
col_transform = ColumnTransformer(transformers = t
|
||||
, remainder='passthrough')
|
||||
#--------------ALEX help
|
||||
# col_transform
|
||||
# col_transform.fit(X)
|
||||
# test = col_transform.transform(X)
|
||||
# print(col_transform.get_feature_names_out())
|
||||
|
||||
# foo = col_transform.fit_transform(X)
|
||||
Xm_train = col_transform.fit_transform(fooD['X'])
|
||||
fooD['X'].shape
|
||||
Xm_train.shape
|
||||
|
||||
# Reuse the transformer fitted on the training data. Refitting on the test
# data would learn a different scaling range and category set, so train and
# test columns would no longer mean the same thing.
# NOTE: with OneHotEncoder's default handle_unknown='error', transform()
# raises if the test data contains a category that was not seen in training.
Xm_test = col_transform.transform(fooD['X_bts'])
|
||||
fooD['X_bts'].shape
|
||||
Xm_test.shape
|
||||
|
||||
X_train = Xm_train.copy()
|
||||
X_test = Xm_test.copy()
|
||||
X_train.shape
|
||||
X_test.shape
|
||||
|
||||
y_train = fooD['y']
|
||||
y_test = fooD['y_bts']
|
||||
y_train.shape
|
||||
y_test.shape
|
||||
|
||||
# perhaps
|
||||
#col_transform.fit(fooD['X'])
|
||||
#encoded_colnames = pd.Index(col_transform.get_feature_names_out())
|
||||
#======================
|
||||
# 1 model
|
||||
rf_all_features = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
|
||||
rf_all_features.fit(X_train, y_train)
|
||||
|
||||
accuracy_score(y_test, rf_all_features.predict(X_test))
|
||||
matthews_corrcoef(y_test, rf_all_features.predict(X_test))
|
||||
|
||||
|
||||
# BORUTA
|
||||
rfc = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
|
||||
boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=2, random_state=1)
|
||||
boruta_selector.fit(np.array(X_train), np.array(y_train))
|
||||
|
||||
# Tells you how many features: GOOD
|
||||
print("Ranking: ",boruta_selector.ranking_)
|
||||
print("No. of significant features: ", boruta_selector.n_features_)
|
||||
|
||||
#selected_rf_features = pd.DataFrame({'Feature':list(X_train.columns),
|
||||
# 'Ranking':boruta_selector.ranking_})
|
||||
|
||||
# tells you the ranking: GOOD
|
||||
# Rebuild the ranking table for THIS run. Without this, selected_rf_features
# still holds the rankings of the numerical-only run. X_train is now the
# ColumnTransformer output, so the feature names are taken from the fitted
# transformer rather than from X_train.columns.
# NOTE(review): assumes col_transform is fitted and that its output columns
# line up one-to-one with boruta_selector.ranking_ -- confirm.
selected_rf_features = pd.DataFrame({'Feature': col_transform.get_feature_names_out(),
                                     'Ranking': boruta_selector.ranking_})
foo2 = selected_rf_features.sort_values(by='Ranking')
|
||||
|
||||
X_important_train = boruta_selector.transform(np.array(X_train))
|
||||
X_important_test = boruta_selector.transform(np.array(X_test))
|
||||
|
||||
rf_boruta = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
|
||||
rf_boruta.fit(X_important_train, y_train)
|
||||
accuracy_score(y_test, rf_boruta.predict(X_important_test))
|
||||
matthews_corrcoef(y_test, rf_boruta.predict(X_important_test))
|
||||
##################################
|
||||
# trying to one hot encode at start
|
||||
# perhaps
|
||||
#col_transform.fit(fooD['X'])
|
||||
#encoded_colnames = pd.Index(col_transform.get_feature_names_out())
|
||||
|
||||
# def encode_and_bind(original_dataframe, feature_to_encode):
|
||||
# dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
|
||||
# res = pd.concat([original_dataframe, dummies], axis=1)
|
||||
# res = res.drop([feature_to_encode], axis=1)
|
||||
# return(res)
|
||||
|
||||
# features_to_encode = ['feature_1', 'feature_2', 'feature_3',
|
||||
# 'feature_4']
|
||||
|
||||
# features_to_encode = list(categorical_ix.copy())
|
||||
|
||||
# for feature in features_to_encode:
|
||||
# X_train_enc = encode_and_bind(fooD['X'], feature)
|
||||
# X_test_enc = encode_and_bind(fooD['X_bts'], feature)
|
||||
|
||||
# c1 = X_train_enc.columns
|
||||
# c2 = X_test_enc.columns
|
||||
# X_train_enc.shape
|
||||
# X_test_enc.shape
|
||||
|
||||
# This one is better!
|
||||
# features_to_encode was only ever assigned inside commented-out code, so
# define it here from the categorical columns detected earlier in the script.
# NOTE(review): assumes every name in categorical_ix is also a column of
# combined_df -- confirm.
features_to_encode = list(categorical_ix)
a = pd.get_dummies(combined_df, columns=features_to_encode)
|
||||
a1=a.columns
|
||||
# DataFrame.drop() removes row labels by default; name the axis explicitly
# so these are dropped as columns.
a2 = a.drop(columns=['gene_name', 'dst', 'dst_mode'])
|
Loading…
Add table
Add a link
Reference in a new issue