added combined model FS code and run script

2022-09-03 12:28:36 +01:00 · 2022-09-03 12:28:36 +01:00 · 2b953583e2
commit 2b953583e2
parent 78704dec5a
7 changed files with 1046 additions and 0 deletions
--- a/scripts/ml/untitled7_boruta.py
+++ b/scripts/ml/untitled7_boruta.py
@@ -0,0 +1,221 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep  2 12:13:53 2022
+
+@author: tanu
+"""
+# https://analyticsindiamag.com/hands-on-guide-to-automated-feature-selection-using-boruta/
+import numpy as np
+import pandas as pd
+from boruta import BorutaPy
+from sklearn.compose import ColumnTransformer
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score
+from sklearn.metrics import matthews_corrcoef
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import OneHotEncoder
+
+
+URL = "https://raw.githubusercontent.com/Aditya1001001/English-Premier-League/master/pos_modelling_data.csv"
+data = pd.read_csv(URL)
+data.info()
+X = data.drop('Position', axis = 1)
+y = data['Position']
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 1) 
+
+rf_all_features = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
+rf_all_features.fit(X_train, y_train) 
+
+
+y_pred = rf_all_features.predict(X_test)
+
+accuracy_score(y_test, rf_all_features.predict(X_test))
+accuracy_score(y_test, y_pred)
+matthews_corrcoef(y_test, rf_all_features.predict(X_test))
+
+# BORUTA
+rfc = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
+boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=2, random_state=1)
+boruta_selector.fit(np.array(X_train), np.array(y_train))  
+
+# Tells you how many features: GOOD
+print("Ranking: ",boruta_selector.ranking_)          
+print("No. of significant features: ", boruta_selector.n_features_) 
+
+selected_rf_features = pd.DataFrame({'Feature':list(X_train.columns),
+                                       'Ranking':boruta_selector.ranking_})
+
+# tells you the ranking: GOOD
+selected_rf_features.sort_values(by='Ranking') 
+
+X_important_train = boruta_selector.transform(np.array(X_train))
+X_important_test = boruta_selector.transform(np.array(X_test)) 
+
+rf_boruta = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
+rf_boruta.fit(X_important_train, y_train) 
+accuracy_score(y_test, rf_boruta.predict(X_important_test))
+matthews_corrcoef(y_test, rf_boruta.predict(X_important_test))
+
+
+
+##############################################################################
+# my data : ONLY numerical values
+# from old scripts (cm_logo_skf_v2.py)
+fooD = combined_DF_OS(combined_df)
+
+allF = fooD['X']
+numerical_ix = fooD['X'].select_dtypes(include=['int64', 'float64']).columns
+numerical_ix
+# just numerical for X_train and X_test
+X_train_numF = fooD['X'][numerical_ix]
+X_test_numF = fooD['X_bts'][numerical_ix]
+#X_train = allF
+
+X_train = X_train_numF 
+X_test = X_test_numF
+
+y_train = fooD['y']
+y_test = fooD['y_bts']
+
+#  1 model
+rf_all_features = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
+rf_all_features.fit(X_train, y_train) 
+
+accuracy_score(y_test, rf_all_features.predict(X_test))
+matthews_corrcoef(y_test, rf_all_features.predict(X_test))
+
+
+# BORUTA
+rfc = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
+boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=2, random_state=1)
+boruta_selector.fit(np.array(X_train), np.array(y_train))  
+
+# Tells you how many features: GOOD
+print("Ranking: ",boruta_selector.ranking_)          
+print("No. of significant features: ", boruta_selector.n_features_) 
+
+selected_rf_features = pd.DataFrame({'Feature':list(X_train.columns),
+                                       'Ranking':boruta_selector.ranking_})
+
+# tells you the ranking: GOOD
+selected_rf_features.sort_values(by='Ranking') 
+
+X_important_train = boruta_selector.transform(np.array(X_train))
+X_important_test = boruta_selector.transform(np.array(X_test)) 
+
+rf_boruta = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
+rf_boruta.fit(X_important_train, y_train) 
+accuracy_score(y_test, rf_boruta.predict(X_important_test))
+matthews_corrcoef(y_test, rf_boruta.predict(X_important_test))
+
+##############################################################################
+# my data : using both numerical and categorical
+# from old scripts (cm_logo_skf_v2.py)
+fooD = combined_DF_OS(combined_df)
+
+numerical_ix = fooD['X'].select_dtypes(include=['int64', 'float64']).columns
+numerical_ix
+print("\nNo. of numerical indices:", len(numerical_ix))
+
+categorical_ix = fooD['X'].select_dtypes(include=['object', 'bool']).columns
+categorical_ix  
+print("\nNo. of categorical indices:", len(categorical_ix))
+
+
+var_type = "mixeds"
+
+if var_type == 'mixed':
+   
+    t = [('num', MinMaxScaler(), numerical_ix)
+         , ('cat', OneHotEncoder(), categorical_ix)]
+    
+col_transform = ColumnTransformer(transformers = t
+                                   , remainder='passthrough')
+#--------------ALEX help
+# col_transform
+# col_transform.fit(X)
+# test = col_transform.transform(X)
+# print(col_transform.get_feature_names_out())
+
+# foo = col_transform.fit_transform(X)
+Xm_train = col_transform.fit_transform(fooD['X'])
+fooD['X'].shape
+Xm_train.shape
+
+Xm_test = col_transform.fit_transform(fooD['X_bts'])
+fooD['X_bts'].shape
+Xm_test.shape
+
+X_train = Xm_train.copy()
+X_test = Xm_test.copy()
+X_train.shape
+X_test.shape
+
+y_train = fooD['y']
+y_test = fooD['y_bts']
+y_train.shape
+y_test.shape
+
+# perhaps
+#col_transform.fit(fooD['X'])
+#encoded_colnames = pd.Index(col_transform.get_feature_names_out())
+#======================
+#  1 model
+rf_all_features = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
+rf_all_features.fit(X_train, y_train) 
+
+accuracy_score(y_test, rf_all_features.predict(X_test))
+matthews_corrcoef(y_test, rf_all_features.predict(X_test))
+
+
+# BORUTA
+rfc = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
+boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=2, random_state=1)
+boruta_selector.fit(np.array(X_train), np.array(y_train))  
+
+# Tells you how many features: GOOD
+print("Ranking: ",boruta_selector.ranking_)          
+print("No. of significant features: ", boruta_selector.n_features_) 
+
+#selected_rf_features = pd.DataFrame({'Feature':list(X_train.columns),
+#                                       'Ranking':boruta_selector.ranking_})
+
+# tells you the ranking: GOOD
+foo2 = selected_rf_features.sort_values(by='Ranking') 
+
+X_important_train = boruta_selector.transform(np.array(X_train))
+X_important_test = boruta_selector.transform(np.array(X_test)) 
+
+rf_boruta = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
+rf_boruta.fit(X_important_train, y_train) 
+accuracy_score(y_test, rf_boruta.predict(X_important_test))
+matthews_corrcoef(y_test, rf_boruta.predict(X_important_test))
+##################################
+# trying to one hot encode at start
+# perhaps
+#col_transform.fit(fooD['X'])
+#encoded_colnames = pd.Index(col_transform.get_feature_names_out())
+
+# def encode_and_bind(original_dataframe, feature_to_encode):
+#     dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
+#     res = pd.concat([original_dataframe, dummies], axis=1)
+#     res = res.drop([feature_to_encode], axis=1)
+#     return(res) 
+
+# features_to_encode = ['feature_1', 'feature_2', 'feature_3',
+#                       'feature_4']
+
+# features_to_encode = list(categorical_ix.copy())
+
+# for feature in features_to_encode:
+#     X_train_enc = encode_and_bind(fooD['X'], feature)
+#     X_test_enc  = encode_and_bind(fooD['X_bts'], feature)
+    
+# c1 = X_train_enc.columns
+# c2 = X_test_enc.columns
+# X_train_enc.shape   
+# X_test_enc.shape   
+
+# This one is better!
+a = pd.get_dummies(combined_df, columns=features_to_encode)
+a1=a.columns
+a2 = a.drop(['gene_name', 'dst', 'dst_mode'])