#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 5 12:57:32 2022

@author: tanu
"""
#%%
# Data, etc. for now comes from my_data6.py and/or my_data5.py.
# NOTE(review): nothing is imported in this file, so the pandas/scikit-learn
# names used below are presumably provided by the same session -- confirm.

#%% try combinations
#import sys, os
#os.system("imports.py")

#%% random forest on one-hot encoded object columns
seed = 42

# One-hot encode every object-dtype column; all other columns pass through.
features_to_encode = list(X_train.select_dtypes(include=['object']).columns)

col_trans = make_column_transformer(
    (OneHotEncoder(), features_to_encode),
    remainder="passthrough",
)

rf_classifier = RandomForestClassifier(
    min_samples_leaf=50,
    n_estimators=150,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=seed,
    # 'auto' was an alias of 'sqrt' for classifiers and has been deprecated
    # and later removed in newer scikit-learn releases.
    max_features='sqrt',
)

pipe = make_pipeline(col_trans, rf_classifier)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

#%% logistic regression on scaled numerical + one-hot categorical features
all_features_df.shape  # interactive inspection only

seed = 42

X_train, X_test, y_train, y_test = train_test_split(
    all_features_df,
    target1,
    test_size=0.33,
    random_state=seed,
)

# ColumnTransformer selects columns by name/index/mask, not by DataFrame, so
# hand it the column labels of the per-type frames.
# NOTE(review): assumes categorical_features_df is a DataFrame like
# numerical_features_df -- confirm against my_data6.py.
numerical_cols = list(numerical_features_df.columns)
categorical_cols = list(categorical_features_df.columns)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols),
    ]
)

# Defined for parity with the cell above; the pipeline below uses logistic
# regression, not this classifier.
rf_classifier = RandomForestClassifier(
    min_samples_leaf=50,
    n_estimators=150,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=seed,
    max_features='sqrt',
)

# No separate preprocessor.fit()/transform() is needed: fitting the pipeline
# fits the preprocessing step on the training split only.
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('regression', linear_model.LogisticRegression()),
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred  # interactive inspection only


def precision(y_true, y_pred):
    """Return the precision score with label 1 as the positive class."""
    return precision_score(y_true, y_pred, pos_label=1)


def recall(y_true, y_pred):
    """Return the recall score with label 1 as the positive class."""
    return recall_score(y_true, y_pred, pos_label=1)


def f1_metric(y_true, y_pred):
    """Return the F1 score with label 1 as the positive class."""
    return f1_score(y_true, y_pred, pos_label=1)


acc = make_scorer(accuracy_score)
prec = make_scorer(precision)
rec = make_scorer(recall)
# The helper is named f1_metric so that this scorer does not rebind the
# function it wraps; `f1` still ends up bound to the scorer, as before.
f1 = make_scorer(f1_metric)

output = cross_validate(
    model,
    X_train,
    y_train,
    scoring={
        'acc': acc,
        'prec': prec,
        'rec': rec,
        'f1': f1,
    },
    cv=10,
    return_train_score=False,
)

pd.DataFrame(output).mean()  # interactive inspection only

#%% with feature selection
seed = 42

X_trainN, X_testN, y_trainN, y_testN = train_test_split(
    numerical_features_df,
    target1,
    test_size=0.33,
    random_state=seed,
)

# Only numerical features are used here, so the preprocessing step is a plain
# MinMaxScaler: the name-based ColumnTransformer above expects categorical
# columns too and cannot cope with RFE dropping columns between iterations.
model = Pipeline(steps=[
    ('preprocess', MinMaxScaler()),
    ('regression', linear_model.LogisticRegression()),
])

# A Pipeline exposes no coef_ itself, so tell RFECV where to read the feature
# importances from.
selector_logistic = RFECV(
    estimator=model,
    cv=10,
    step=1,
    importance_getter='named_steps.regression.coef_',
)

selector_logistic_xtrain = selector_logistic.fit_transform(X_trainN, y_trainN)

print(selector_logistic.get_support())
X_trainN.columns  # interactive inspection only
print(selector_logistic.ranking_)