saving work

This commit is contained in:
Tanushree Tunstall 2022-03-16 10:11:13 +00:00
parent a1631ea54b
commit e28a296d98
8 changed files with 153 additions and 212 deletions

View file

@ -77,12 +77,6 @@ def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df):
for clf_name, clf in clfs: for clf_name, clf in clfs:
#%% #%%
# pipeline = Pipeline(steps=[
# ('scaler', MinMaxScaler()),
# #('scaler', StandardScaler()),
# ('classifier', clf)
# ]
# )
# define the data preparation for the columns # define the data preparation for the columns
t = [('cat', OneHotEncoder(), categorical_ix) t = [('cat', OneHotEncoder(), categorical_ix)
, ('num', MinMaxScaler(), numerical_ix)] , ('num', MinMaxScaler(), numerical_ix)]

View file

@ -6,51 +6,79 @@ Created on Fri Mar 4 15:25:33 2022
@author: tanu @author: tanu
""" """
#%% #%%
import os, sys import os, sys
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from sklearn.linear_model import LogisticRegression import pprint as pp
#from copy import deepcopy
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import BernoulliNB from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from sklearn.model_selection import cross_validate from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
from statistics import mean, stdev, median, mode from statistics import mean, stdev, median, mode
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
#from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours
#%% #%%
rs = {'random_state': 42} rs = {'random_state': 42}
# Done: add preprocessing step with one hot encoder # Done: add preprocessing step with one hot encoder
# TODO: supply stratified K-fold cv train and test data # Done: get accuracy and other scores through K-fold stratified cv
# TODO: get accuracy and other scores through K-fold cv
scoring_fn = ({ 'fscore' : make_scorer(f1_score)
, 'mcc' : make_scorer(matthews_corrcoef)
, 'precision' : make_scorer(precision_score)
, 'recall' : make_scorer(recall_score)
, 'accuracy' : make_scorer(accuracy_score)
, 'roc_auc' : make_scorer(roc_auc_score)
#, 'jaccard' : make_scorer(jaccard_score)
})
# Multiple Classification - Model Pipeline # Multiple Classification - Model Pipeline
def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical','mixed'], skf_splits = 10): def MultClassPipelineCV(X_train, X_test, y_train, y_test, input_df, var_type = ['numerical', 'categorical','mixed']):
''' # determine categorical and numerical features
@ param input_df: input features
@ type: df (gets converted to np.array for stratified Kfold, and helps identify names to apply column transformation)
@param y_outputF: target (or output) feature
@type: df or np.array
returns
multiple classification model scores
'''
# Determine categorical and numerical features
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix numerical_ix
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
@ -69,129 +97,67 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
col_transform = ColumnTransformer(transformers = t col_transform = ColumnTransformer(transformers = t
, remainder='passthrough') , remainder='passthrough')
#%% Define classification models to run #%%
log_reg = LogisticRegression(**rs) log_reg = LogisticRegression(**rs)
nb = BernoulliNB() nb = BernoulliNB()
knn = KNeighborsClassifier() knn = KNeighborsClassifier()
svm = SVC(**rs) svm = SVC(**rs)
mlp = MLPClassifier(max_iter = 500, **rs) mlp = MLPClassifier(max_iter=500, **rs)
dt = DecisionTreeClassifier(**rs) dt = DecisionTreeClassifier(**rs)
et = ExtraTreesClassifier(**rs) et = ExtraTreesClassifier(**rs)
rf = RandomForestClassifier(**rs) rf = RandomForestClassifier(**rs)
rf2 = RandomForestClassifier( rf2 = RandomForestClassifier(
min_samples_leaf = 50, min_samples_leaf=50,
n_estimators = 150, n_estimators=150,
bootstrap = True, bootstrap=True,
oob_score = True, oob_score=True,
n_jobs = -1, n_jobs=-1,
random_state = 42, random_state=42,
max_features = 'auto') max_features='auto')
xgb = XGBClassifier(**rs, verbosity = 0) xgb = XGBClassifier(**rs, verbosity=0)
clfs = [ models = [
('Logistic Regression' , log_reg) ('Logistic Regression', log_reg),
#, ('Naive Bayes' , nb) ('Naive Bayes', nb),
, ('K-Nearest Neighbors', knn) ('K-Nearest Neighbors', knn),
, ('SVM' , svm) ('SVM', svm),
, ('MLP' , mlp) ('MLP', mlp),
, ('Decision Tree' , dt) ('Decision Tree', dt),
, ('Extra Trees' , et) ('Extra Trees', et),
, ('Random Forest' , rf) ('Random Forest', rf),
, ('Naive Bayes' , nb) ('Random Forest2', rf2),
#('XGBoost', xgb)
#, ('Random Forest2' , rf2)
#, ('XGBoost' , xgb)
] ]
skf = StratifiedKFold(n_splits = skf_splits skf_cv_scores = {}
, shuffle = True
#, random_state = seed_skf for model_name, model_fn in models:
, **rs) print('\nModel_name:', model_name
, '\nModel func:' , model_fn
X_array = np.array(input_df) , '\nList of models:', models)
Y = y_targetF
# Initialise score metrics list to store skf results
# fscoreL = []
# mccL = []
# presL = []
# recallL = []
# accuL = []
# roc_aucL = []
skf_dict = {}
#scores_df = pd.DataFrame() # model_pipeline = Pipeline([
for train_index, test_index in skf.split(input_df, y_targetF): # ('pre' , MinMaxScaler())
x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index] # , ('model' , model_fn)])
y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
#fscoreL = {} model_pipeline = Pipeline([
('prep' , col_transform)
, ('model' , model_fn)])
print('Running model pipeline:', model_pipeline)
skf_cv = cross_validate(model_pipeline
, X_train
, y_train
, cv = 10
, scoring = scoring_fn
, return_train_score = True)
skf_cv_scores[model_name] = {}
for key, value in skf_cv.items():
print('\nkey:', key, '\nvalue:', value)
print('\nmean value:', mean(value))
skf_cv_scores[model_name][key] = round(mean(value),2)
#pp.pprint(skf_cv_scores)
return(skf_cv_scores)
# for train_index, test_index in skf.split(X_array, Y):
# print('\nSKF train index:', train_index
# , '\nSKF test index:', test_index)
# x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
# y_train_fold, y_test_fold = Y[train_index], Y[test_index]
clf_scores_df = pd.DataFrame()
for clf_name, clf in clfs:
print('\nRunning the following classification models'
, clf_name)
model_pipeline = Pipeline(steps=[('prep' , col_transform)
, ('classifier' , clf)])
# model_pipeline = Pipeline(steps=[('prep' , MinMaxScaler())
# , ('classifier' , clf)])
model_pipeline.fit(x_train_fold, y_train_fold)
y_pred_fold = model_pipeline.predict(x_test_fold)
#----------------
# Model metrics
#----------------
# F1-Score
fscore = f1_score(y_test_fold, y_pred_fold)
fscoreL[clf_name].append(fscore)
print('fscoreL Len: ', len(fscoreL))
#fscoreM = mean(fscoreL[clf])
# Matthews correlation coefficient
mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
mccL[clf_name].append(mcc)
mccM = mean(mccL)
# # Precision
# pres = precision_score(y_test_fold, y_pred_fold)
# presL.append(pres)
# presM = mean(presL)
# # Recall
# recall = recall_score(y_test_fold, y_pred_fold)
# recallL.append(recall)
# recallM = mean(recallL)
# # Accuracy
# accu = accuracy_score(y_test_fold, y_pred_fold)
# accuL.append(accu)
# accuM = mean(accuL)
# # ROC_AUC
# roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
# roc_aucL.append(roc_auc)
# roc_aucM = mean(roc_aucL)
clf_scores_df = clf_scores_df.append({'Model' : clf_name
,'F1_score' : fscoreM
, 'MCC' : mccM
, 'Precision': presM
, 'Recall' : recallM
, 'Accuracy' : accuM
, 'ROC_curve': roc_aucM}
, ignore_index = True)
return(clf_scores_df)
#scores_df = scores_df.append(clf_scores_df)
# return clf_scores_df

View file

@ -8,6 +8,7 @@ Created on Sun Mar 6 13:41:54 2022
import os, sys import os, sys
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import pprint as pp
#from copy import deepcopy #from copy import deepcopy
from sklearn import linear_model from sklearn import linear_model
from sklearn.linear_model import LogisticRegression, LinearRegression from sklearn.linear_model import LogisticRegression, LinearRegression
@ -64,6 +65,8 @@ os.chdir(homedir + "/git/ML_AI_training/")
from MultClassPipe import MultClassPipeline from MultClassPipe import MultClassPipeline
from MultClassPipe2 import MultClassPipeline2 from MultClassPipe2 import MultClassPipeline2
from loopity_loop import MultClassPipeSKF from loopity_loop import MultClassPipeSKF
from MultClassPipe3 import MultClassPipelineCV
gene = 'pncA' gene = 'pncA'
drug = 'pyrazinamide' drug = 'pyrazinamide'

View file

@ -82,13 +82,13 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
et = ExtraTreesClassifier(**rs) et = ExtraTreesClassifier(**rs)
rf = RandomForestClassifier(**rs) rf = RandomForestClassifier(**rs)
rf2 = RandomForestClassifier( rf2 = RandomForestClassifier(
min_samples_leaf = 50, min_samples_leaf = 50
n_estimators = 150, , n_estimators = 150
bootstrap = True, , bootstrap = True
oob_score = True, , oob_score = True
n_jobs = -1, , n_jobs = -1
random_state = 42, , **rs
max_features = 'auto') , max_features = 'auto')
xgb = XGBClassifier(**rs, verbosity = 0) xgb = XGBClassifier(**rs, verbosity = 0)
classification_metrics = { classification_metrics = {
@ -97,20 +97,20 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
,'Precision': [] ,'Precision': []
,'Recall': [] ,'Recall': []
,'Accuracy': [] ,'Accuracy': []
#,'ROC_AUC': [] ,'ROC_AUC': []
} }
models = [ models = [
('Logistic Regression' , log_reg) ('Logistic Regression' , log_reg)
, ('Naive Bayes' , nb) , ('Naive Bayes' , nb)
, ('K-Nearest Neighbors', knn) , ('K-Nearest Neighbors', knn)
, ('SVM' , svm) , ('SVM' , svm)
# , ('MLP' , mlp) , ('MLP' , mlp)
# , ('Decision Tree' , dt) , ('Decision Tree' , dt)
# , ('Extra Trees' , et) , ('Extra Trees' , et)
# , ('Random Forest' , rf) , ('Random Forest' , rf)
# , ('Naive Bayes' , nb) , ('Naive Bayes' , nb)
#, ('Random Forest2' , rf2) , ('Random Forest2' , rf2)
#, ('XGBoost' , xgb) #, ('XGBoost' , xgb)
] ]
@ -118,7 +118,7 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
, shuffle = True , shuffle = True
, **rs) , **rs)
skf_dict = {} # skf_dict = {}
fold_no = 1 fold_no = 1
fold_dict={} fold_dict={}
@ -145,12 +145,12 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
#---------------- #----------------
fscore = f1_score(y_test_fold, y_pred_fold) fscore = f1_score(y_test_fold, y_pred_fold)
mcc = matthews_corrcoef(y_test_fold, y_pred_fold) mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
#pres = precision_score(y_test_fold, y_pred_fold) pres = precision_score(y_test_fold, y_pred_fold)
#recall = recall_score(y_test_fold, y_pred_fold) recall = recall_score(y_test_fold, y_pred_fold)
pres = precision_score(y_test_fold, y_pred_fold, zero_division=0) #pres = precision_score(y_test_fold, y_pred_fold, zero_division=0)
recall = recall_score(y_test_fold, y_pred_fold, zero_division=0) #recall = recall_score(y_test_fold, y_pred_fold, zero_division=0)
accu = accuracy_score(y_test_fold, y_pred_fold) accu = accuracy_score(y_test_fold, y_pred_fold)
#roc_auc = roc_auc_score(y_test_fold, y_pred_fold) roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
fold=("fold_"+str(fold_no)) fold=("fold_"+str(fold_no))
@ -165,7 +165,7 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
fold_dict[model_name][fold].update({'Precision' : pres}) fold_dict[model_name][fold].update({'Precision' : pres})
fold_dict[model_name][fold].update({'Recall' : recall}) fold_dict[model_name][fold].update({'Recall' : recall})
fold_dict[model_name][fold].update({'Accuracy' : accu}) fold_dict[model_name][fold].update({'Accuracy' : accu})
#fold_dict[model_name][fold].update({'ROC_AUC' : roc_auc}) fold_dict[model_name][fold].update({'ROC_AUC' : roc_auc})
fold_no +=1 fold_no +=1
#pp.pprint(skf_dict) #pp.pprint(skf_dict)

View file

@ -7,55 +7,32 @@ Created on Fri Mar 11 11:15:50 2022
""" """
#%% #%%
del(t3_res) del(t3_res)
t3_res = MultClassPipeSKF(input_df = numerical_features_df # t3_res = MultClassPipeSKF(input_df = numerical_features_df
, y_targetF = target1 # , y_targetF = target1
# , var_type = 'numerical'
# , skf_splits = 10)
# pp.pprint(t3_res)
# #print(t3_res)
t3_res = MultClassPipeSKF(input_df = num_df_wtgt[numerical_FN]
, y_targetF = num_df_wtgt['mutation_class']
, var_type = 'numerical' , var_type = 'numerical'
, skf_splits = 10) , skf_splits = 10)
pp.pprint(t3_res) pp.pprint(t3_res)
#print(t3_res) #print(t3_res)
#%% Manually: mean for each model, each metric
model_name = 'Logistic Regression'
model_name = 'Naive Bayes'
model_name = 'K-Nearest Neighbors'
model_name = 'SVM'
#%%
model_metric = 'F1_score'
log_reg_f1 = []
for key in t3_res[model_name]:
log_reg_f1.append(t3_res[model_name][key][model_metric])
log_reg_f1M = mean(log_reg_f1)
print('key:', key, model_metric, ':', log_reg_f1)
print(log_reg_f1M)
log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
log_reg_f1df
#%%
model_metric = 'MCC'
log_reg_mcc = []
for key in t3_res[model_name]:
log_reg_mcc.append(t3_res[model_name][key][model_metric])
log_reg_mccM = mean(log_reg_mcc)
print('key:', key, model_metric, ':', log_reg_mcc)
print(log_reg_mccM)
log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
log_reg_mccdf
#%%
################################################################ ################################################################
# extract items from within a nested dict # extract items from within a nested dict
#%% Classification Metrics we need to mean() #%% Classification Metrics we need to mean()
classification_metrics = { # classification_metrics = {
'F1_score': [] # 'F1_score': []
,'MCC': [] # ,'MCC': []
,'Precision': [] # ,'Precision': []
,'Recall': [] # ,'Recall': []
,'Accuracy': [] # ,'Accuracy': []
} # ,'ROC_AUC':[]
# }
# "mean() of the current metric across all folds for this model" # "mean() of the current metric across all folds for this model"
# the output containing all the metrics across all folds for this model # the output containing all the metrics across all folds for this model
out={} out={}
# Just the mean() for each of the above metrics-per-model # Just the mean() for each of the above metrics-per-model
@ -64,16 +41,16 @@ out_means={}
# Build up out{} from t3_res, which came from loopity_loop # Build up out{} from t3_res, which came from loopity_loop
for model in t3_res: for model in t3_res:
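# NOTE: plain assignment does not copy a dict in Python (it only aliases it), so build a fresh dict per model # NOTE: plain assignment does not copy a dict in Python (it only aliases it), so build a fresh dict per model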
out[model]={'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': []} out[model]={'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': [], 'ROC_AUC':[]}
out_means[model]={} # just to make life easier out_means[model]={} # just to make life easier
print(model) print(model)
for fold in t3_res[model]: for fold in t3_res[model]:
for metric in {'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': []}: for metric in {'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': [], 'ROC_AUC':[]}:
metric_value = t3_res[model][fold][metric] metric_value = t3_res[model][fold][metric]
out[model][metric].append(metric_value) out[model][metric].append(metric_value)
# now that we've built out{}, let's mean() each metric # now that we've built out{}, let's mean() each metric
for model in out: for model in out:
for metric in {'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': []}: for metric in {'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': [], 'ROC_AUC':[]}:
metric_mean = mean(out[model][metric]) metric_mean = mean(out[model][metric])
# just some debug output # just some debug output
# print('model:', model # print('model:', model
@ -84,3 +61,4 @@ for model in out:
out_means[model].update({(metric+'_mean'): metric_mean }) out_means[model].update({(metric+'_mean'): metric_mean })
out_scores = pd.DataFrame(out_means) out_scores = pd.DataFrame(out_means)
out_scores2 = round(out_scores, 2)