LSHTM_analysis/scripts/ml/dummy_classifier.py

141 lines
No EOL
3.1 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 7 10:54:09 2022
@author: tanu
"""
from collections import Counter

import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, matthews_corrcoef
from sklearn.model_selection import cross_validate, train_test_split
# Toy example: fit a baseline classifier on a four-sample array and look at
# its predictions, class probabilities and accuracy.
X_eg = np.array([-1, 1, 1, 1])
y_eg = np.array([0, 1, 1, 1])

# The original assigned dummy_clf three times in a row ("most_frequent", then
# "stratified" twice); only the last assignment ever took effect, so the dead
# ones are dropped and the effective "stratified" strategy is kept.
# Alternative baseline: DummyClassifier(strategy="most_frequent")
dummy_clf = DummyClassifier(strategy="stratified")
dummy_clf.fit(X_eg, y_eg)

# No random_state is passed, so the "stratified" draws below are expected to
# differ from run to run.
dummy_clf.predict(X_eg)
dummy_clf.predict(np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))
dummy_clf.predict_proba(X_eg)
dummy_clf.score(X_eg, y_eg)
# 0.75  <- score pasted from an earlier run; presumably obtained with
#          "most_frequent" (3 of the 4 labels in y_eg are 1) -- confirm.
# Repeat the baseline using the 'X' and 'y' columns of `df2`.
# NOTE(review): `df2` is not created anywhere in this file -- presumably a
# DataFrame left over in the interactive session; confirm before running.
df2['X']
# Refit the existing dummy_clf object on df2 (this replaces the toy fit).
dummy_clf.fit(df2['X'], df2['y'])
dummy_clf.predict(df2['X'])
dummy_clf.predict_proba(df2['X'])
# Keep one set of predictions so it can be compared with the true labels.
# NOTE(review): if dummy_clf is still the unseeded "stratified" estimator,
# every predict call is a fresh random draw, so ypred will not equal the
# discarded predictions above.
ypred = dummy_clf.predict(df2['X'])
dummy_clf.score(df2['X'], df2['y'])
# Compare true labels (first argument) with the baseline predictions.
confusion_matrix(df2['y'], ypred)
matthews_corrcoef(df2['y'], ypred)
#%%
# Inspect the class balance of the target in each data set.
# NOTE(review): `df` and `df2` are not created anywhere in this file --
# presumably DataFrames from the interactive session; confirm.
df['dst_mode']
# Target over all rows of df: the 'dst' column.
y_all_tt = df.loc[:,'dst']
y_all_tt.value_counts()
#Counter(y_all_tt)
#0: 71, 1: 114
# Same counts expressed as proportions.
y_all_tt.value_counts(normalize = True)
df2['y']
# 'y' column of df2, bound here as the training target.
y_train_tt = df2['y']
Counter(y_train_tt)
##0: 41, 1: 82
y_train_tt.value_counts(normalize = True)
df2['y_bts']
# 'y_bts' column of df2 -- presumably the blind-test-set target; confirm.
y_bts_tt = df2['y_bts']
Counter(y_bts_tt)
#0: 21, 1: 41
y_bts_tt.value_counts(normalize = True)
#%%
# Majority-class baseline on a stratified 70/30 train/test split of the rows
# that have a 'dst' label.
# NOTE(review): `df` is not created anywhere in this file -- presumably a
# DataFrame already present in the interactive session; confirm.
df_clean = df[df['dst'].notna()]

# Columns are selected by position: 0-170 as features, 171 and 172 as the two
# candidate targets. NOTE(review): these indices silently break if the column
# order of df changes -- confirm they still match 'dst' / 'dst_mode'.
X = df_clean.iloc[:, 0:171]
X.columns
y = df_clean.iloc[:, 171]  # dst
y.value_counts()
#########################
y2 = df_clean.iloc[:, 172]  # dst_mode
y2.value_counts()

# stratify=y asks for the class proportions of y to be kept in both
# partitions; the fixed random_state makes the split repeatable.
# This rebinds y_train_tt, which an earlier cell set to df2['y'].
X_train_tt, X_test_tt, y_train_tt, y_test_tt = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# Class counts and proportions: all labelled rows, then train, then test.
round(y.value_counts(normalize=True), 2)
y_train_tt.value_counts()
round(y_train_tt.value_counts(normalize=True), 2)
y_test_tt.value_counts()
round(y_test_tt.value_counts(normalize=True), 2)

# Fit on the training partition only, then predict the held-out rows.
# (The original also executed a bare DummyClassifier(strategy='most_frequent')
# line here -- pasted REPL output that built and discarded an estimator.)
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train_tt, y_train_tt)
dummy_clf.predict(X_test_tt)
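# pnca: split 70/30
# =======================
# Total y count in data:
# 1.0 114 (62%)
# 0.0 71 (38%)
# =======================
# =======================
# Train y count in data:
# 1.0 79 (61%)
# 0.0 50 (39%)
# =======================
# =======================
# Test y count in data:
# 1.0 35 (62%)
# 0.0 21 (38%)
# =======================
# accuracy:
# (TP+TN)/(TP+TN+FP+FN)
# 114/71
# NOTE(review): a most-frequent baseline on the full data would score
# 114/(114+71) ~= 0.62 -- confirm what "114/71" was meant to record.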
######################################
# try with CV
# Cross-validated majority-class baseline on a 12-sample toy data set.
X_eg = np.array([-1, 1, 1, 1, -2, 9, 4, 4, 1, -1, 3, 0])
y_eg = np.array([0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1])

# Plain fit/score on the full toy data first, as a point of comparison.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_eg, y_eg)
dummy_clf.predict(X_eg)
dummy_clf.score(X_eg, y_eg)

# NOTE(review): `scoring_fn` is not defined anywhere in this file --
# presumably a scorer (or dict of scorers) left in the interactive session;
# confirm before running this cell on its own.
cv_DummyD = cross_validate(
    dummy_clf,
    X_eg,
    y_eg,
    cv=5,
    # groups=group,
    scoring=scoring_fn,
    return_train_score=True,
)

# Reduce every entry of the cross_validate result to its mean, rounded to two
# decimals, and store it under the 'DUMMY' key. All three statements belong
# to the loop body: each one uses the current key/value pair.
cv_dummyD_ALL = {}
cv_dummyD_ALL['DUMMY'] = {}
for key, value in cv_DummyD.items():
    print('\nkey:', key, '\nvalue:', value)
    print('\nmean value:', np.mean(value))
    cv_dummyD_ALL['DUMMY'][key] = round(np.mean(value), 2)