#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 7 10:54:09 2022

@author: tanu

Scratch script exploring sklearn's DummyClassifier as a baseline: first on a
toy array, then on the `df` / `df2` data frames, and finally under
cross-validation.

NOTE(review): `df`, `df2` and `scoring_fn` are not defined in this file;
presumably they already exist in the interactive session in which these cells
are run -- confirm before running this as a standalone script.
"""
from collections import Counter

import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, matthews_corrcoef
from sklearn.model_selection import cross_validate, train_test_split

# Toy example
X_eg = np.array([-1, 1, 1, 1])
y_eg = np.array([0, 1, 1, 1])

# Alternative baseline: DummyClassifier(strategy="most_frequent")
dummy_clf = DummyClassifier(strategy="stratified")

dummy_clf.fit(X_eg, y_eg)
#DummyClassifier(strategy='most_frequent')
dummy_clf.predict(X_eg)
dummy_clf.predict(np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))
dummy_clf.predict_proba(X_eg)
dummy_clf.score(X_eg, y_eg)
# 0.75   <- pasted console output

# Same baseline on df2 (defined outside this file, see module docstring)
df2['X']
dummy_clf.fit(df2['X'], df2['y'])
dummy_clf.predict(df2['X'])
dummy_clf.predict_proba(df2['X'])
ypred = dummy_clf.predict(df2['X'])
dummy_clf.score(df2['X'], df2['y'])
confusion_matrix(df2['y'], ypred)
matthews_corrcoef(df2['y'], ypred)

#%%
df['dst_mode']

y_all_tt = df.loc[:, 'dst']
y_all_tt.value_counts()
#Counter(y_all_tt)
#0: 71, 1: 114
y_all_tt.value_counts(normalize=True)

df2['y']
y_train_tt = df2['y']
Counter(y_train_tt)
##0: 41, 1: 82
y_train_tt.value_counts(normalize=True)

df2['y_bts']
y_bts_tt = df2['y_bts']
Counter(y_bts_tt)
#0: 21, 1: 41
y_bts_tt.value_counts(normalize=True)

#%%
# Keep only the rows that have a 'dst' label
df_clean = df[df['dst'].notna()]

X = df_clean.iloc[:, 0:171]
X.columns

y = df_clean.iloc[:, 171]  # dst
y.value_counts()

#########################
y2 = df_clean.iloc[:, 172]  # dst_mode
y2.value_counts()

X_train_tt, X_test_tt, y_train_tt, y_test_tt = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)

y2.value_counts()
round(y.value_counts(normalize=True), 2)

y_train_tt.value_counts()
round(y_train_tt.value_counts(normalize=True), 2)

y_test_tt.value_counts()
round(y_test_tt.value_counts(normalize=True), 2)

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train_tt, y_train_tt)
#DummyClassifier(strategy='most_frequent')
dummy_clf.predict(X_test_tt)

# Pasted console output, kept for reference.
# pnca: 70/30 train/test split (test_size=0.30)
# =======================
# Total y count in data:
# 1.0    114 (62%)
# 0.0     71 (38%)
# =======================
# =======================
# Train y count in data:
# 1.0    79 (61%)
# 0.0    50 (39%)
# =======================
# =======================
# Test y count in data:
# 1.0    35 (62%)
# 0.0    21 (38%)
# =======================
# accuracy: (TP+TN)/(TP+TN+FP+FN)
# 114/71

######################################
# try with CV
X_eg = np.array([-1, 1, 1, 1, -2, 9, 4, 4, 1, -1, 3, 0])
y_eg = np.array([0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1])

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_eg, y_eg)
#DummyClassifier(strategy='most_frequent')
dummy_clf.predict(X_eg)
#dummy_clf.predict(np.array([1,1,1,1,1,1,1,1,1,1]))
#dummy_clf.predict_proba(X_eg)
dummy_clf.score(X_eg, y_eg)

# NOTE(review): `scoring_fn` is not defined in this file -- see module docstring.
cv_DummyD = cross_validate(
    dummy_clf,
    X_eg,
    y_eg,
    cv=5,
    #groups=group,
    scoring=scoring_fn,
    return_train_score=True,
)

# Collapse each per-fold result array into its mean, rounded to 2 decimals.
cv_dummyD_ALL = {}
cv_dummyD_ALL['DUMMY'] = {}
for key, value in cv_DummyD.items():
    mean_value = np.mean(value)  # computed once, used for both print and store
    print('\nkey:', key, '\nvalue:', value)
    print('\nmean value:', mean_value)
    cv_dummyD_ALL['DUMMY'][key] = round(mean_value, 2)