#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 2 16:10:44 2022

@author: tanu

Boruta feature selection on a mixed (numerical + categorical) feature set.

Steps:
    1. Build the train / blind-test data via ``combined_DF_OS(combined_df)``.
    2. Scale numerical columns and one-hot encode categorical columns,
       fitting the transformer on the training data only.
    3. Fit a baseline classifier on all encoded features and score it.
    4. Run BorutaPy to rank the features and keep the confirmed ones.
    5. Refit the classifier on the selected features and score it again.
"""
import os

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, matthews_corrcoef
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from boruta import BorutaPy

# NOTE(review): combined_DF_OS and combined_df are neither defined nor
# imported in this file; presumably they come from project scripts run
# earlier in the same session -- confirm.
fooD = combined_DF_OS(combined_df)

#======================
# Column types
#======================
numerical_ix = fooD['X'].select_dtypes(include=['int64', 'float64']).columns
print("\nNo. of numerical indices:", len(numerical_ix))

categorical_ix = fooD['X'].select_dtypes(include=['object', 'bool']).columns
print("\nNo. of categorical indices:", len(categorical_ix))

var_type = "mixed"

if var_type == 'mixed':
    # handle_unknown='ignore': a category that only occurs in the blind test
    # set is encoded as all zeros instead of raising at transform time.
    t = [('num', MinMaxScaler(), numerical_ix),
         ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix)]

    # sparse_threshold=0 forces a dense ndarray so that the np.array(...)
    # calls further down receive a real 2-D array rather than a wrapped
    # sparse matrix.
    col_transform = ColumnTransformer(transformers=t,
                                      remainder='passthrough',
                                      sparse_threshold=0)

#======================
# Encode train and blind test data
#======================
# Fit on the training data only; the test data is transformed with the
# parameters (scaling ranges, one-hot categories) learned from training so
# that both matrices share exactly the same columns.
Xm_train = col_transform.fit_transform(fooD['X'])
print("\nTrain shape (raw -> encoded):", fooD['X'].shape, "->", Xm_train.shape)

Xm_test = col_transform.transform(fooD['X_bts'])
print("Test shape (raw -> encoded):", fooD['X_bts'].shape, "->", Xm_test.shape)

X_train = Xm_train.copy()
X_test = Xm_test.copy()

y_train = fooD['y']
y_test = fooD['y_bts']
print("Target shapes (train, test):", y_train.shape, y_test.shape)

y_train_arr = np.array(y_train)

#======================
# 1 model
#======================
n_jobs = os.cpu_count()
njobs = {'n_jobs': n_jobs}
rs = {'random_state': 42}

# Candidate estimators that were tried; exactly one is selected below.
# VotingClassifier is not listed: it needs a list of (name, estimator)
# tuples rather than a count.
candidate_models = {
    'random_forest': RandomForestClassifier(n_estimators=1000, max_depth=5,
                                            **rs, **njobs),
    'bagging': BaggingClassifier(random_state=1, n_estimators=100,
                                 verbose=3, **njobs),
    'adaboost': AdaBoostClassifier(random_state=1, n_estimators=1000),
    'extra_trees': ExtraTreesClassifier(random_state=1, n_estimators=1000,
                                        max_depth=5, verbose=3),
    'decision_tree': DecisionTreeClassifier(random_state=1, max_depth=5),
}
model_name = 'decision_tree'
rf_all_features = candidate_models[model_name]

rf_all_features.fit(X_train, y_train_arr)
y_pred_all = rf_all_features.predict(X_test)
print("\nAll features | accuracy:", accuracy_score(y_test, y_pred_all))
print("All features | MCC:", matthews_corrcoef(y_test, y_pred_all))

#======================
# BORUTA
#======================
# NOTE(review): BorutaPy appears to set n_estimators on the wrapped
# estimator; confirm that a single DecisionTreeClassifier is accepted, or
# select one of the ensemble candidates above.
boruta_selector = BorutaPy(rf_all_features, **rs, verbose=3)
boruta_selector.fit(np.array(X_train), y_train_arr)

# Tells you how many features: GOOD
print("Ranking: ", boruta_selector.ranking_)
print("No. of significant features: ", boruta_selector.n_features_)

#======================
# Map rankings back to encoded column names
#======================
cm_df = combined_df.drop(['gene_name', 'dst', 'dst_mode'], axis=1)

# Use the names from the transformer as fitted on the training data: these
# are the columns of X_train, so they line up one-to-one with
# boruta_selector.ranking_ and boruta_selector.support_. Refitting the
# transformer on another frame here would discard that fit.
var_type_colnames = pd.Index(col_transform.get_feature_names_out())

if var_type == 'mixed':
    print('\nVariable type is:', var_type
          , '\nNo. of columns in input_df:', len(cm_df.columns)
          , '\nNo. of columns post one hot encoder:', len(var_type_colnames))
else:
    print('\nNo. of columns in input_df:', len(cm_df.columns))

selected_rf_features = pd.DataFrame({'Feature': list(var_type_colnames),
                                     'Ranking': boruta_selector.ranking_})

# tells you the ranking: GOOD
sel_rf_features_sorted = selected_rf_features.sort_values(by='Ranking')

sel_features = var_type_colnames[boruta_selector.support_]

#======================
# Refit on the Boruta-selected features only
#======================
X_important_train = boruta_selector.transform(np.array(X_train))
X_important_test = boruta_selector.transform(np.array(X_test))

rf_all_features.fit(X_important_train, y_train_arr)
y_pred_important = rf_all_features.predict(X_important_test)
print("\nSelected features | accuracy:", accuracy_score(y_test, y_pred_important))
print("Selected features | MCC:", matthews_corrcoef(y_test, y_pred_important))