From d14c3f9c4af647a5d1f7fff0ce4984e36d7447b3 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Thu, 7 Jul 2022 12:28:58 +0100
Subject: [PATCH] adde dummy classifier

---
 scripts/ml/dummy_classifier.py | 108 +++++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 scripts/ml/dummy_classifier.py

diff --git a/scripts/ml/dummy_classifier.py b/scripts/ml/dummy_classifier.py
new file mode 100644
index 0000000..e084fd5
--- /dev/null
+++ b/scripts/ml/dummy_classifier.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jul 7 10:54:09 2022
+
+@author: tanu
+
+Majority-class baseline using scikit-learn's DummyClassifier.
+
+Scratch script meant to be run cell by cell ("#%%" markers). The data
+cells use ``df`` and ``df2``, which are not created in this file and must
+already exist in the session.
+"""
+
+from collections import Counter
+
+import numpy as np
+from sklearn.dummy import DummyClassifier
+from sklearn.metrics import matthews_corrcoef
+from sklearn.model_selection import train_test_split
+
+#%% Toy example: the baseline always predicts the most frequent label
+X_eg = np.array([-1, 1, 1, 1])
+y_eg = np.array([0, 1, 1, 1])
+dummy_clf = DummyClassifier(strategy="most_frequent")
+dummy_clf.fit(X_eg, y_eg)
+dummy_clf.predict(X_eg)  # array([1, 1, 1, 1])
+dummy_clf.score(X_eg, y_eg)  # 0.75
+
+# MCC is a function in sklearn.metrics, not an estimator method; it takes
+# the true labels and the predictions.
+matthews_corrcoef(y_eg, dummy_clf.predict(X_eg))
+
+#%% Class counts of the target columns
+df['dst_mode']
+y_all_tt = df.loc[:, 'dst']
+y_all_tt.value_counts()
+# Counter(y_all_tt)
+# 0: 71, 1: 114
+y_all_tt.value_counts(normalize=True)
+
+df2['y']
+y_train_tt = df2['y']
+Counter(y_train_tt)
+# 0: 41, 1: 82
+y_train_tt.value_counts(normalize=True)
+
+df2['y_bts']
+y_bts_tt = df2['y_bts']
+Counter(y_bts_tt)
+# 0: 21, 1: 41
+y_bts_tt.value_counts(normalize=True)
+
+#%% Stratified 70/30 split and majority-class baseline
+df_clean = df[df['dst'].notna()]
+X = df_clean.iloc[:, 0:171]
+X.columns
+
+y = df_clean.iloc[:, 171]  # dst
+y.value_counts()
+
+y2 = df_clean.iloc[:, 172]  # dst_mode
+y2.value_counts()
+
+X_train_tt, X_test_tt, y_train_tt, y_test_tt = train_test_split(
+    X, y, test_size=0.30, random_state=42, stratify=y
+)
+
+y.value_counts()
+round(y.value_counts(normalize=True), 2)
+
+y_train_tt.value_counts()
+round(y_train_tt.value_counts(normalize=True), 2)
+
+y_test_tt.value_counts()
+round(y_test_tt.value_counts(normalize=True), 2)
+
+dummy_clf = DummyClassifier(strategy="most_frequent")
+dummy_clf.fit(X_train_tt, y_train_tt)
+y_pred_tt = dummy_clf.predict(X_test_tt)
+dummy_clf.score(X_test_tt, y_test_tt)
+matthews_corrcoef(y_test_tt, y_pred_tt)
+
+# pnca: split 70/30
+# =======================
+# Total y count in data:
+# 1.0 114 (62%)
+# 0.0 71 (38%)
+# =======================
+#
+# =======================
+# Train y count in data:
+# 1.0 79 (61%)
+# 0.0 50 (39%)
+# =======================
+#
+# =======================
+# Test y count in data:
+# 1.0 35 (62%)
+# 0.0 21 (38%)
+# =======================
+#
+# accuracy:
+#   (TP + TN) / (TP + TN + FP + FN)
+#
+# With the counts above the baseline predicts 1.0 for every sample, so
+# TN = FN = 0 and test accuracy = 35 / (35 + 21) = 0.625.
+# Class ratio in the full data (1.0 : 0.0): 114 / 71 = 1.61