#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 7 22:18:14 2022

@author: tanu

CHECK_model: sanity-check script that compares the project's MultModelsCl
wrapper against a hand-built StandardScaler + LinearDiscriminantAnalysis
pipeline on the Pima Indians diabetes dataset.  It reports accuracy,
precision, recall, F1, Jaccard and MCC under k-fold cross-validation and
on a held-out ("blind test" / BTS) split.

(Extracted from patch 880ef46 "added CHECK_model", 2022-07-08.)
"""

import pandas as pd
from pandas import read_csv
from sklearn.model_selection import (KFold, cross_val_score,
                                     train_test_split)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# BUG FIX: these metric functions were used in the final print section of the
# original script but never imported, so that section raised NameError.
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, jaccard_score, matthews_corrcoef)

# BUG FIX: MultModelsCl is a project-local helper (scripts/ml/ml_functions);
# the original script called it twice without importing it (NameError).
# NOTE(review): confirm this module path matches the project layout.
from MultModelsCl import MultModelsCl

# --------------------------------------------------------------- load data
# Pima Indians diabetes dataset (network access required).
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values

# First 8 columns are features; the last column is the binary class label.
X = pd.DataFrame(array[:, 0:8])
Y = pd.DataFrame(array[:, 8])

# BUG FIX: the original used KFold(n_splits=1); scikit-learn requires
# n_splits >= 2, so that call raised ValueError.  Use the minimum valid
# value for this quick "check" run.
kfold = KFold(n_splits=2, random_state=None)
spl_type = "check"

fooD1 = MultModelsCl(input_df=X,
                     target=Y,
                     sel_cv=kfold,
                     run_blind_test=False,
                     # blind_test_df=df2['X_bts'],
                     # blind_test_target=df2['y_bts'],
                     add_cm=False,
                     add_yn=False,
                     tts_split_type=spl_type,
                     resampling_type='none',  # default
                     var_type=['mixed'],
                     scale_numeric=['std'],
                     return_formatted_output=True)

# ------------------------------------------------------ reference pipeline
# Standardize the data, then fit Linear Discriminant Analysis.
model = Pipeline([('standardize', StandardScaler()),
                  ('lda', LinearDiscriminantAnalysis())])

# Metric names map 1:1 onto cross_val_score `scoring` strings AND onto the
# sklearn.metrics functions, so one table drives both the CV scores and the
# blind-test scores below (replaces 12 copy-pasted cross_val_score stanzas).
METRICS = {
    'accuracy': accuracy_score,
    'recall': recall_score,
    'precision': precision_score,
    'f1': f1_score,
    'jaccard': jaccard_score,
    'matthews_corrcoef': matthews_corrcoef,
}


def _cv_scores(estimator, features, target, cv):
    """Return {metric_name: mean CV score rounded to 2 dp} for METRICS.

    Prints each raw mean as it goes, matching the original script's output
    style.  `scoring=None` for accuracy (cross_val_score's default for
    classifiers is the estimator's .score, i.e. accuracy).
    """
    out = {}
    for name in METRICS:
        scoring = None if name == 'accuracy' else name
        scores = cross_val_score(estimator, features, target,
                                 cv=cv, scoring=scoring)
        print(scores.mean())
        out[name] = round(scores.mean(), 2)
    return out


# evaluate pipeline with 10-fold CV on the full data
seed = 7
# NOTE: KFold(random_state=seed) without shuffle=True raises in modern
# scikit-learn, hence random_state=None here (as in the original).
kfold = KFold(n_splits=10, random_state=None)

# .values.ravel(): pass a 1-D target instead of a one-column DataFrame,
# avoiding sklearn's DataConversionWarning (same values either way).
cv_full = _cv_scores(model, X, Y.values.ravel(), kfold)
results_A = cv_full['accuracy']
results_R = cv_full['recall']
results_P = cv_full['precision']
results_f1 = cv_full['f1']
results_J = cv_full['jaccard']
results_mcc = cv_full['matthews_corrcoef']

#%%
# ----------------------------------------- train / blind-test (BTS) split
X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify=Y,
                                                    test_size=0.2)

fooD2 = MultModelsCl(input_df=X_train,
                     target=y_train,
                     sel_cv=kfold,
                     run_blind_test=True,
                     blind_test_df=X_test,
                     blind_test_target=y_test,
                     add_cm=False,
                     add_yn=False,
                     tts_split_type=spl_type,
                     resampling_type='none',  # default
                     var_type=['mixed'],
                     scale_numeric=['std'],
                     return_formatted_output=True)

# Fit on the training split, predict the held-out test split.
model.fit(X_train, y_train.values.ravel())
y_pred = model.predict(X_test)

# CV scores on the training split only.
cv_train = _cv_scores(model, X_train, y_train.values.ravel(), kfold)

# BUG FIX: the original printed "CV example accuracy" / "BTS example
# accuracy" for every metric (precision, recall, F1, ...), which mislabelled
# the output.  Print each metric under its real name, CV next to blind-test.
y_true = y_test.values.ravel()
for name, metric_fn in METRICS.items():
    print(f'\nCV {name}:', cv_train[name])
    print(f'BTS {name}:', round(metric_fn(y_true, y_pred), 2))