#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 5 12:57:32 2022

@author: tanu

Exploratory model-fitting cells (``#%%`` cell style).

This part defines the scoring helpers, the train/test splits, the
preprocessing step and a logistic-regression pipeline.
"""
# NOTE: this script was recovered from a whitespace-collapsed git diff that
# added it as the new file my_data9.py. The same diff also touched imports.py,
# only removing the trailing newline after its last line; the context lines of
# that hunk were:
#     categorical_features_df = my_df[categorical_features_names]
#     categorical_features_df.shape
#     all_features_df = my_df[numerical_features_names + categorical_features_names]
#     all_features_df.shape
#
# NOTE(review): nothing is imported here. The scikit-learn names, the feature
# frames, the feature-name lists and target1 are presumably provided by
# imports.py / my_data5.py / my_data6.py in the same interpreter session --
# confirm before running this file on its own.
#%%
# data, etc for now comes from my_data6.py and/or my_data5.py

#%% try combinations
#import sys, os
#os.system("imports.py")
def precision(y_true, y_pred):
    """Return ``precision_score`` for the predictions, with label 1 as positive."""
    return precision_score(y_true, y_pred, pos_label=1)


def recall(y_true, y_pred):
    """Return ``recall_score`` for the predictions, with label 1 as positive."""
    return recall_score(y_true, y_pred, pos_label=1)


def f1(y_true, y_pred):
    """Return ``f1_score`` for the predictions, with label 1 as positive."""
    return f1_score(y_true, y_pred, pos_label=1)


#%%
# Bare expressions: they only display a value when the cell is run interactively.
numerical_features_df.shape
categorical_features_df.shape
all_features_df.shape
#%%
target = target1
#target = target3

# The same test_size and random_state are used for the three feature sets
# (numerical, categorical, all).
X_trainN, X_testN, y_trainN, y_testN = train_test_split(
    numerical_features_df,
    target,
    test_size=0.33,
    random_state=42,
)

X_trainC, X_testC, y_trainC, y_testC = train_test_split(
    categorical_features_df,
    target,
    test_size=0.33,
    random_state=42,
)

X_train, X_test, y_train, y_test = train_test_split(
    all_features_df,
    target,
    test_size=0.33,
    random_state=42,
)
#%%

#%%
# Only the numerical columns are transformed (min-max scaled); one-hot encoding
# of the categorical columns is currently disabled.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features_names),
        #('cat', OneHotEncoder(), categorical_features_names),
    ])

# Fit/transform on the full numerical frame; the transformed result is not
# stored, it is only displayed when the cell is run interactively.
preprocessor.fit(numerical_features_df)
preprocessor.transform(numerical_features_df)
#%%
model_log = Pipeline(steps=[
    ('preprocess', preprocessor),
    #('log_reg', linear_model.LogisticRegression()),
    ('log_reg', LogisticRegression(
        # Was 'unbalanced', which is not an accepted value and raises on
        # fit(); scikit-learn accepts a dict, 'balanced' or None.
        # NOTE(review): 'balanced' is assumed to be the intent -- use None
        # if no class re-weighting was wanted.
        class_weight='balanced')),
])
model = model_log
#%%
seed = 42
# Random-forest pipeline; reuses the `preprocessor` and `seed` defined earlier
# in this script.
model_rf = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('rf', RandomForestClassifier(
        min_samples_leaf=50,
        n_estimators=150,
        bootstrap=True,
        oob_score=True,
        n_jobs=-1,
        random_state=seed,
        # Was 'auto', which is deprecated in scikit-learn 1.1 and removed in
        # 1.3; for classifiers 'auto' meant 'sqrt', so behaviour is unchanged.
        max_features='sqrt')),
])
model = model_rf
#%%
model.fit(X_trainN, y_trainN)
y_pred = model.predict(X_testN)
y_pred  # bare expression: displayed only when the cell is run interactively

# Wrap the metric functions as scorers for cross_validate. The F1 scorer gets
# its own name: rebinding `f1` would overwrite the f1() helper defined earlier
# in this script, and re-running this cell would then wrap a scorer in a scorer.
acc = make_scorer(accuracy_score)
prec = make_scorer(precision)
rec = make_scorer(recall)
f1_scorer = make_scorer(f1)

# 10-fold cross-validation on the training split; test-fold scores only.
output = cross_validate(
    model,
    X_trainN,
    y_trainN,
    scoring={
        'acc': acc,
        'prec': prec,
        'rec': rec,
        'f1': f1_scorer,
    },
    cv=10,
    return_train_score=False,
)
pd.DataFrame(output).mean()


#%%
# Recursive feature elimination with cross-validation.
# The final estimator of the pipeline is passed rather than the pipeline
# itself: RFECV needs an estimator exposing coef_ or feature_importances_
# (a Pipeline exposes neither), and it hands plain arrays with a shrinking
# set of columns to the estimator, which the name-based ColumnTransformer
# step cannot handle.
# NOTE(review): this skips the MinMaxScaler step. If `model` is switched to
# model_log, consider scaling X_trainN before the selection.
selector_logistic = RFECV(
    estimator=model.steps[-1][1],
    cv=10,
    step=1,
)

X_trainN, X_testN, y_trainN, y_testN = train_test_split(
    numerical_features_df,
    target1,
    test_size=0.33,
    random_state=42,
)

selector_logistic_xtrain = selector_logistic.fit_transform(X_trainN, y_trainN)
# Was `sel_rfe_logistic`, a name that is never defined (NameError).
print(selector_logistic.get_support())
X_trainN.columns

print(selector_logistic.ranking_)