#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 2 16:10:44 2022

@author: tanu
"""
import os

import numpy as np
import pandas as pd

from boruta import BorutaPy
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, matthews_corrcoef
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
# Build the (oversampled) data dictionary from the combined dataframe.
# NOTE(review): combined_DF_OS / combined_df come from an earlier session
# or companion script -- fooD is presumably a dict with keys
# 'X', 'y', 'X_bts', 'y_bts' (used below); confirm against the caller.
fooD = combined_DF_OS(combined_df)

# Split the training-set columns by dtype so each group can be routed
# to the appropriate preprocessing step below.
numerical_ix = fooD['X'].select_dtypes(include=['int64', 'float64']).columns
print("\nNo. of numerical indices:", len(numerical_ix))

categorical_ix = fooD['X'].select_dtypes(include=['object', 'bool']).columns
print("\nNo. of categorical indices:", len(categorical_ix))
# The variable type decides which preprocessing pipeline is assembled.
var_type = "mixed"

if var_type == 'mixed':
    # Min-max scale the numeric columns and one-hot encode the
    # categorical/boolean ones; any column matched by neither index
    # passes through unchanged.
    t = [('num', MinMaxScaler(), numerical_ix),
         ('cat', OneHotEncoder(), categorical_ix)]
    col_transform = ColumnTransformer(transformers=t,
                                      remainder='passthrough')
else:
    # BUG FIX (latent): the original only defined col_transform inside
    # the 'mixed' branch, yet col_transform is used unconditionally
    # further down -- any other var_type would die later with a
    # confusing NameError.  Fail fast with a clear message instead.
    raise ValueError(f"Unsupported var_type: {var_type!r}")
# ---------------------------------------------------------------------
# Encode the data: fit the ColumnTransformer on the TRAINING set only,
# then reuse the already-fitted transformer on the blind test set.
#
# BUG FIX: the original called col_transform.fit_transform() on
# fooD['X_bts'] as well.  Re-fitting on the test data leaks test-set
# information into the scaler ranges and the one-hot category sets,
# and if the blind set contains a different set of categories the two
# encoded matrices would not even share a column layout.
#
# (To recover the post-encoding column names, use
#  col_transform.get_feature_names_out() after fitting -- done further
#  below for the Boruta ranking table.)
# ---------------------------------------------------------------------
Xm_train = col_transform.fit_transform(fooD['X'])
Xm_test = col_transform.transform(fooD['X_bts'])  # transform only, no re-fit

X_train = Xm_train.copy()
X_test = Xm_test.copy()

y_train = fooD['y']
y_test = fooD['y_bts']

# Sanity report: encoding must preserve the row counts.
print("\nX_train shape:", X_train.shape,
      "\nX_test shape:", X_test.shape,
      "\ny_train shape:", y_train.shape,
      "\ny_test shape:", y_test.shape)

#======================
# ---------------------------------------------------------------------
# 1. Baseline estimator on ALL features
# ---------------------------------------------------------------------
n_jobs = os.cpu_count()
njobs = {'n_jobs': n_jobs}
rs = {'random_state': 42}

# Candidate estimators tried interactively -- only ONE may be active.
# BUG FIX: the original assigned rf_all_features five times in a row,
# so only the final assignment (DecisionTreeClassifier) ever took
# effect and the earlier constructions were dead code.  They are kept
# commented out for reference; uncomment exactly one to switch model.
# rf_all_features = RandomForestClassifier(n_estimators=1000, max_depth=5, **rs, **njobs)
# rf_all_features = VotingClassifier(estimators=1000)
# rf_all_features = BaggingClassifier(random_state=1, n_estimators=100, verbose=3, **njobs)
# rf_all_features = AdaBoostClassifier(random_state=1, n_estimators=1000)
# rf_all_features = ExtraTreesClassifier(random_state=1, n_estimators=1000, max_depth=5, verbose=3)
rf_all_features = DecisionTreeClassifier(random_state=1, max_depth=5)

rf_all_features.fit(X_train, np.array(y_train))

# Predict once and reuse the predictions; the original predicted twice
# and discarded both metric values (bare expressions) -- print them.
y_pred_all = rf_all_features.predict(X_test)
print("Accuracy (all features):", accuracy_score(y_test, y_pred_all))
print("MCC (all features):", matthews_corrcoef(y_test, y_pred_all))
# ------------------------- BORUTA selection --------------------------
# Wrap the baseline estimator in Boruta and fit it on the encoded
# training data (BorutaPy requires plain numpy arrays).
boruta_selector = BorutaPy(rf_all_features, **rs, verbose=3)
boruta_selector.fit(np.array(X_train), np.array(y_train))

# Report how each feature ranked and how many were confirmed.
print("Ranking: ", boruta_selector.ranking_)
print("No. of significant features: ", boruta_selector.n_features_)
# Recover the post-encoding column names so the Boruta ranking can be
# labelled.  NOTE(review): this re-fits col_transform on cm_df rather
# than on fooD['X']; the ranking labels are only valid if cm_df has the
# same columns/categories as the training input -- confirm upstream.
cm_df = combined_df.drop(['gene_name', 'dst', 'dst_mode'], axis=1)
col_transform.fit(cm_df)

# Call get_feature_names_out() once and reuse it (the original called
# it redundantly).
var_type_colnames = pd.Index(col_transform.get_feature_names_out())

if var_type == 'mixed':
    print('\nVariable type is:', var_type,
          '\nNo. of columns in input_df:', len(cm_df.columns),
          '\nNo. of columns post one hot encoder:', len(var_type_colnames))
else:
    # BUG FIX: the original else branch referenced an undefined name
    # `input_df`, which would raise NameError if var_type were ever
    # not 'mixed'; report on cm_df instead.
    print('\nNo. of columns in input_df:', len(cm_df.columns))

# Feature-vs-ranking table, best (rank 1) first.
selected_rf_features = pd.DataFrame({'Feature': list(var_type_colnames),
                                     'Ranking': boruta_selector.ranking_})
sel_rf_features_sorted = selected_rf_features.sort_values(by='Ranking')

# Names of the Boruta-confirmed features only.
sel_features = var_type_colnames[boruta_selector.support_]
# Reduce both sets to the Boruta-confirmed feature subset and refit the
# same estimator on that subset.
X_important_train = boruta_selector.transform(np.array(X_train))
X_important_test = boruta_selector.transform(np.array(X_test))

rf_all_features.fit(X_important_train, y_train)

# Single prediction pass reused for both metrics (the fitted model is
# deterministic, so this matches the original's two separate calls).
important_preds = rf_all_features.predict(X_important_test)
accuracy_score(y_test, important_preds)
matthews_corrcoef(y_test, important_preds)