saving work

This commit is contained in:
Tanushree Tunstall 2022-03-16 10:11:13 +00:00
parent a1631ea54b
commit e28a296d98
8 changed files with 153 additions and 212 deletions

View file

@@ -77,12 +77,6 @@ def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df):
for clf_name, clf in clfs:
#%%
-# pipeline = Pipeline(steps=[
-# ('scaler', MinMaxScaler()),
-# #('scaler', StandardScaler()),
-# ('classifier', clf)
-# ]
-# )
# define the data preparation for the columns
t = [('cat', OneHotEncoder(), categorical_ix)
, ('num', MinMaxScaler(), numerical_ix)]
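Review note: for anyone reading this hunk out of context, below is a minimal, self-contained sketch of the pattern the code settles on here: per-column preparation via ColumnTransformer (OneHotEncoder for categoricals, MinMaxScaler for numericals) feeding a classifier Pipeline. The toy frame and column names are illustrative only, not from this repo.

# Hedged sketch of the ColumnTransformer pattern; data and names are made up.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

df = pd.DataFrame({'residue': ['ala', 'gly', 'ala', 'pro'],   # categorical
                   'stability': [0.2, -1.3, 0.5, -0.7]})      # numerical
y = [1, 0, 1, 0]

categorical_ix = df.select_dtypes(include=['object', 'bool']).columns
numerical_ix = df.select_dtypes(include=['int64', 'float64']).columns
t = [('cat', OneHotEncoder(), categorical_ix),
     ('num', MinMaxScaler(), numerical_ix)]
col_transform = ColumnTransformer(transformers=t, remainder='passthrough')

pipe = Pipeline(steps=[('prep', col_transform),
                       ('classifier', LogisticRegression(random_state=42))])
pipe.fit(df, y)
print(pipe.predict(df))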

View file

@@ -6,51 +6,79 @@ Created on Fri Mar 4 15:25:33 2022
@author: tanu
"""
#%%
import os, sys
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import pprint as pp
#from copy import deepcopy
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
from statistics import mean, stdev, median, mode
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
#from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours
#%%
rs = {'random_state': 42}
# Done: add preprocessing step with one hot encoder
# TODO: supply stratified K-fold cv train and test data
-# TODO: get accuracy and other scores through K-fold cv
+# Done: get accuracy and other scores through K-fold stratified cv
+scoring_fn = ({ 'fscore' : make_scorer(f1_score)
+, 'mcc' : make_scorer(matthews_corrcoef)
+, 'precision' : make_scorer(precision_score)
+, 'recall' : make_scorer(recall_score)
+, 'accuracy' : make_scorer(accuracy_score)
+, 'roc_auc' : make_scorer(roc_auc_score)
+#, 'jaccard' : make_scorer(jaccard_score)
+})
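Review note: make_scorer(roc_auc_score) as written computes AUC from hard 0/1 predictions, not from probabilities, so this 'roc_auc' is stricter than the usual ranking AUC. If probability-based AUC is wanted instead, a one-line sketch (requires a classifier exposing predict_proba; needs_proba is the pre-1.4 sklearn spelling, newer versions use response_method='predict_proba'):

roc_auc_proba = make_scorer(roc_auc_score, needs_proba=True)  # hedged alternative, not what this commit does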
# Multiple Classification - Model Pipeline
-def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical','mixed'], skf_splits = 10):
+def MultClassPipelineCV(X_train, X_test, y_train, y_test, input_df, var_type = ['numerical', 'categorical','mixed']):
'''
@param input_df: input features
@type: df (converted to np.array for stratified K-fold; the df column names identify which column transformations to apply)
@param y_targetF: target (or output) feature
@type: df or np.array
returns
multiple classification model scores
'''
-# Determine categorical and numerical features
+# determine categorical and numerical features
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
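Review note: include=['int64', 'float64'] silently skips narrower numeric dtypes such as int32/float32, which can arrive via CSV readers or upstream casts. A self-contained sketch of the failure mode and a broader selector (made-up data):

import numpy as np
import pandas as pd

demo_df = pd.DataFrame({'a': np.array([1, 2], dtype='int32'),  # numeric, but not int64
                        'b': [0.1, 0.2],
                        'c': ['x', 'y']})
print(demo_df.select_dtypes(include=['int64', 'float64']).columns.tolist())  # ['b'] -- misses 'a'
print(demo_df.select_dtypes(include=np.number).columns.tolist())             # ['a', 'b']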
@@ -70,128 +98,66 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
col_transform = ColumnTransformer(transformers = t
, remainder='passthrough')
-#%% Define classification models to run
+#%%
log_reg = LogisticRegression(**rs)
nb = BernoulliNB()
knn = KNeighborsClassifier()
svm = SVC(**rs)
-mlp = MLPClassifier(max_iter = 500, **rs)
+mlp = MLPClassifier(max_iter=500, **rs)
dt = DecisionTreeClassifier(**rs)
et = ExtraTreesClassifier(**rs)
rf = RandomForestClassifier(**rs)
rf2 = RandomForestClassifier(
-min_samples_leaf = 50,
-n_estimators = 150,
-bootstrap = True,
-oob_score = True,
-n_jobs = -1,
-random_state = 42,
-max_features = 'auto')
+min_samples_leaf=50,
+n_estimators=150,
+bootstrap=True,
+oob_score=True,
+n_jobs=-1,
+random_state=42,
+max_features='auto')
-xgb = XGBClassifier(**rs, verbosity = 0)
+xgb = XGBClassifier(**rs, verbosity=0)
-clfs = [
-('Logistic Regression' , log_reg)
-#, ('Naive Bayes' , nb)
-, ('K-Nearest Neighbors', knn)
-, ('SVM' , svm)
-, ('MLP' , mlp)
-, ('Decision Tree' , dt)
-, ('Extra Trees' , et)
-, ('Random Forest' , rf)
-, ('Naive Bayes' , nb)
-#, ('Random Forest2' , rf2)
-#, ('XGBoost' , xgb)
+models = [
+('Logistic Regression', log_reg),
+('Naive Bayes', nb),
+('K-Nearest Neighbors', knn),
+('SVM', svm),
+('MLP', mlp),
+('Decision Tree', dt),
+('Extra Trees', et),
+('Random Forest', rf),
+('Random Forest2', rf2),
+#('XGBoost', xgb)
]
-skf = StratifiedKFold(n_splits = skf_splits
-, shuffle = True
-#, random_state = seed_skf
-, **rs)
+skf_cv_scores = {}
-X_array = np.array(input_df)
-Y = y_targetF
for model_name, model_fn in models:
print('\nModel_name:', model_name
, '\nModel func:' , model_fn
, '\nList of models:', models)
# Initialise score metrics list to store skf results
# fscoreL = []
# mccL = []
# presL = []
# recallL = []
# accuL = []
# roc_aucL = []
skf_dict = {}
# model_pipeline = Pipeline([
# ('pre' , MinMaxScaler())
# , ('model' , model_fn)])
#scores_df = pd.DataFrame()
-for train_index, test_index in skf.split(input_df, y_targetF):
-    x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
-    y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
-#fscoreL = {}
+model_pipeline = Pipeline([
+    ('prep' , col_transform)
+    , ('model' , model_fn)])
+# for train_index, test_index in skf.split(X_array, Y):
+#     print('\nSKF train index:', train_index
+#           , '\nSKF test index:', test_index)
+#     x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
+#     y_train_fold, y_test_fold = Y[train_index], Y[test_index]
+print('Running model pipeline:', model_pipeline)
+skf_cv = cross_validate(model_pipeline
+                        , X_train
+                        , y_train
+                        , cv = 10
+                        , scoring = scoring_fn
+                        , return_train_score = True)
+skf_cv_scores[model_name] = {}
+for key, value in skf_cv.items():
+    print('\nkey:', key, '\nvalue:', value)
+    print('\nmean value:', mean(value))
+    skf_cv_scores[model_name][key] = round(mean(value),2)
+#pp.pprint(skf_cv_scores)
+return(skf_cv_scores)
-clf_scores_df = pd.DataFrame()
-for clf_name, clf in clfs:
-print('\nRunning the following classification models'
-, clf_name)
-model_pipeline = Pipeline(steps=[('prep' , col_transform)
-, ('classifier' , clf)])
-# model_pipeline = Pipeline(steps=[('prep' , MinMaxScaler())
-# , ('classifier' , clf)])
-model_pipeline.fit(x_train_fold, y_train_fold)
-y_pred_fold = model_pipeline.predict(x_test_fold)
-#----------------
-# Model metrics
-#----------------
-# F1-Score
-fscore = f1_score(y_test_fold, y_pred_fold)
-fscoreL[clf_name].append(fscore)
-print('fscoreL Len: ', len(fscoreL))
-#fscoreM = mean(fscoreL[clf])
-# Matthews correlation coefficient
-mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
-mccL[clf_name].append(mcc)
-mccM = mean(mccL)
-# # Precision
-# pres = precision_score(y_test_fold, y_pred_fold)
-# presL.append(pres)
-# presM = mean(presL)
-# # Recall
-# recall = recall_score(y_test_fold, y_pred_fold)
-# recallL.append(recall)
-# recallM = mean(recallL)
-# # Accuracy
-# accu = accuracy_score(y_test_fold, y_pred_fold)
-# accuL.append(accu)
-# accuM = mean(accuL)
-# # ROC_AUC
-# roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
-# roc_aucL.append(roc_auc)
-# roc_aucM = mean(roc_aucL)
-clf_scores_df = clf_scores_df.append({'Model' : clf_name
-,'F1_score' : fscoreM
-, 'MCC' : mccM
-, 'Precision': presM
-, 'Recall' : recallM
-, 'Accuracy' : accuM
-, 'ROC_curve': roc_aucM}
-, ignore_index = True)
-return(clf_scores_df)
-#scores_df = scores_df.append(clf_scores_df)
-# return clf_scores_df
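Review note: the new MultClassPipelineCV flow is hard to see through the deleted fold-loop code above, so here is a condensed, self-contained sketch of the cross_validate pattern it moves to (synthetic data, two models, 3 folds instead of 10, and only two scorers, purely for speed):

# Hedged sketch of the cross_validate flow: one prep+model pipeline per
# classifier, multi-metric CV, then mean of every returned array.
from statistics import mean
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer, matthews_corrcoef
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, n_features=6, random_state=42)
scoring_fn = {'fscore': make_scorer(f1_score),
              'mcc': make_scorer(matthews_corrcoef)}
models = [('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
          ('Decision Tree', DecisionTreeClassifier(random_state=42))]

cv_scores = {}
for model_name, model_fn in models:
    pipe = Pipeline([('prep', MinMaxScaler()), ('model', model_fn)])
    cv = cross_validate(pipe, X, y, cv=3, scoring=scoring_fn, return_train_score=True)
    # keys look like 'test_fscore', 'train_mcc', plus 'fit_time'/'score_time'
    cv_scores[model_name] = {key: round(mean(value), 2) for key, value in cv.items()}
print(cv_scores)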

View file

@@ -8,6 +8,7 @@ Created on Sun Mar 6 13:41:54 2022
import os, sys
import pandas as pd
import numpy as np
+import pprint as pp
#from copy import deepcopy
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression, LinearRegression
@@ -64,6 +65,8 @@ os.chdir(homedir + "/git/ML_AI_training/")
from MultClassPipe import MultClassPipeline
from MultClassPipe2 import MultClassPipeline2
from loopity_loop import MultClassPipeSKF
+from MultClassPipe3 import MultClassPipelineCV
gene = 'pncA'
drug = 'pyrazinamide'

View file

@@ -82,13 +82,13 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
et = ExtraTreesClassifier(**rs)
rf = RandomForestClassifier(**rs)
rf2 = RandomForestClassifier(
-min_samples_leaf = 50,
-n_estimators = 150,
-bootstrap = True,
-oob_score = True,
-n_jobs = -1,
-random_state = 42,
-max_features = 'auto')
+min_samples_leaf = 50
+, n_estimators = 150
+, bootstrap = True
+, oob_score = True
+, n_jobs = -1
+, **rs
+, max_features = 'auto')
xgb = XGBClassifier(**rs, verbosity = 0)
classification_metrics = {
@@ -97,20 +97,20 @@
,'Precision': []
,'Recall': []
,'Accuracy': []
-#,'ROC_AUC': []
+,'ROC_AUC': []
}
models = [
('Logistic Regression' , log_reg)
, ('Naive Bayes' , nb)
, ('K-Nearest Neighbors', knn)
, ('SVM' , svm)
-# , ('MLP' , mlp)
-# , ('Decision Tree' , dt)
-# , ('Extra Trees' , et)
-# , ('Random Forest' , rf)
-# , ('Naive Bayes' , nb)
+, ('MLP' , mlp)
+, ('Decision Tree' , dt)
+, ('Extra Trees' , et)
+, ('Random Forest' , rf)
+, ('Naive Bayes' , nb)
-#, ('Random Forest2' , rf2)
+, ('Random Forest2' , rf2)
#, ('XGBoost' , xgb)
]
@@ -118,7 +118,7 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
, shuffle = True
, **rs)
-skf_dict = {}
+# skf_dict = {}
fold_no = 1
fold_dict={}
@@ -145,12 +145,12 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
#----------------
fscore = f1_score(y_test_fold, y_pred_fold)
mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
-#pres = precision_score(y_test_fold, y_pred_fold)
-#recall = recall_score(y_test_fold, y_pred_fold)
-pres = precision_score(y_test_fold, y_pred_fold, zero_division=0)
-recall = recall_score(y_test_fold, y_pred_fold, zero_division=0)
+pres = precision_score(y_test_fold, y_pred_fold)
+recall = recall_score(y_test_fold, y_pred_fold)
+#pres = precision_score(y_test_fold, y_pred_fold, zero_division=0)
+#recall = recall_score(y_test_fold, y_pred_fold, zero_division=0)
accu = accuracy_score(y_test_fold, y_pred_fold)
-#roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
+roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
fold=("fold_"+str(fold_no))
@@ -165,7 +165,7 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
fold_dict[model_name][fold].update({'Precision' : pres})
fold_dict[model_name][fold].update({'Recall' : recall})
fold_dict[model_name][fold].update({'Accuracy' : accu})
-#fold_dict[model_name][fold].update({'ROC_AUC' : roc_auc})
+fold_dict[model_name][fold].update({'ROC_AUC' : roc_auc})
fold_no +=1
#pp.pprint(skf_dict)
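Review note: a condensed, self-contained sketch of the manual StratifiedKFold loop this file builds up (synthetic data, one model, 3 folds), showing the nested fold_dict shape the driver script consumes; as in the code above, ROC_AUC here is computed from hard label predictions rather than probabilities:

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=120, n_features=5, random_state=42)
X, y = pd.DataFrame(X), pd.Series(y)

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
model_name = 'Logistic Regression'
model = LogisticRegression(max_iter=1000, random_state=42)

fold_dict = {model_name: {}}
fold_no = 1
for train_index, test_index in skf.split(X, y):
    x_train_fold, x_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
    model.fit(x_train_fold, y_train_fold)
    y_pred_fold = model.predict(x_test_fold)
    fold_dict[model_name]['fold_' + str(fold_no)] = {
        'F1_score': f1_score(y_test_fold, y_pred_fold),
        'ROC_AUC': roc_auc_score(y_test_fold, y_pred_fold),  # AUC from hard labels
    }
    fold_no += 1
print(fold_dict)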

View file

@@ -7,55 +7,32 @@ Created on Fri Mar 11 11:15:50 2022
"""
#%%
del(t3_res)
-t3_res = MultClassPipeSKF(input_df = numerical_features_df
-, y_targetF = target1
+# t3_res = MultClassPipeSKF(input_df = numerical_features_df
+# , y_targetF = target1
+# , var_type = 'numerical'
+# , skf_splits = 10)
+# pp.pprint(t3_res)
+# #print(t3_res)
+t3_res = MultClassPipeSKF(input_df = num_df_wtgt[numerical_FN]
+, y_targetF = num_df_wtgt['mutation_class']
, var_type = 'numerical'
, skf_splits = 10)
pp.pprint(t3_res)
#print(t3_res)
-#%% Manually: mean for each model, each metric
-model_name = 'Logistic Regression'
-model_name = 'Naive Bayes'
-model_name = 'K-Nearest Neighbors'
-model_name = 'SVM'
-#%%
-model_metric = 'F1_score'
-log_reg_f1 = []
-for key in t3_res[model_name]:
-    log_reg_f1.append(t3_res[model_name][key][model_metric])
-log_reg_f1M = mean(log_reg_f1)
-print('key:', key, model_metric, ':', log_reg_f1)
-print(log_reg_f1M)
-log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
-log_reg_f1df
-#%%
-model_metric = 'MCC'
-log_reg_mcc = []
-for key in t3_res[model_name]:
-    log_reg_mcc.append(t3_res[model_name][key][model_metric])
-log_reg_mccM = mean(log_reg_mcc)
-print('key:', key, model_metric, ':', log_reg_mcc)
-print(log_reg_mccM)
-log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
-log_reg_mccdf
#%%
################################################################
# extract items from within a nested dict
#%% Classification metrics we need to mean() over the folds
-classification_metrics = {
-'F1_score': []
-,'MCC': []
-,'Precision': []
-,'Recall': []
-,'Accuracy': []
-}
+# classification_metrics = {
+# 'F1_score': []
+# ,'MCC': []
+# ,'Precision': []
+# ,'Recall': []
+# ,'Accuracy': []
+# ,'ROC_AUC':[]
+# }
# "mean() of the current metric across all folds for this model"
# the output containing all the metrics across all folds for this model
out={}
# Just the mean() for each of the above metrics-per-model
@@ -64,16 +41,16 @@ out_means={}
# Build up out{} from t3_res, which came from loopity_loop
for model in t3_res:
# NOTE: build a fresh dict per model; reusing one shared dict object here would leak values across models
-out[model]={'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': []}
+out[model]={'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': [], 'ROC_AUC':[]}
out_means[model]={} # just to make life easier
print(model)
for fold in t3_res[model]:
-for metric in {'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': []}:
+for metric in {'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': [], 'ROC_AUC':[]}:
metric_value = t3_res[model][fold][metric]
out[model][metric].append(metric_value)
# now that we've built out{}, let's mean() each metric
for model in out:
-for metric in {'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': []}:
+for metric in {'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': [], 'ROC_AUC':[]}:
metric_mean = mean(out[model][metric])
# just some debug output
# print('model:', model
@@ -84,3 +61,4 @@ for model in out:
out_means[model].update({(metric+'_mean'): metric_mean })
out_scores = pd.DataFrame(out_means)
+out_scores2 = round(out_scores, 2)
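Review note: the out{}/out_means{} loops above hand-roll a per-metric mean over the nested results dict. An equivalent, shorter pandas route, assuming the same t3_res shape (model -> fold -> metric) and using made-up numbers:

import pandas as pd

t3_res = {'Logistic Regression': {'fold_1': {'F1_score': 0.80, 'MCC': 0.61},
                                  'fold_2': {'F1_score': 0.84, 'MCC': 0.67}}}
rows = [{'Model': model, **metrics}
        for model, folds in t3_res.items()
        for metrics in folds.values()]
out_scores2 = pd.DataFrame(rows).groupby('Model').mean().round(2)
print(out_scores2)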