diff --git a/MultClassPipe.py b/MultClassPipe.py
index 217bbe9..44506aa 100644
--- a/MultClassPipe.py
+++ b/MultClassPipe.py
@@ -20,7 +20,8 @@ from xgboost import XGBClassifier
 from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
+from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
 #%%
 rs = {'random_state': 42}
 # TODO: add preprocessing step with one hot encoder
@@ -63,7 +64,7 @@ def MultClassPipeline(X_train, X_test, y_train, y_test):
 
     pipelines = []
 
-    scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
+    scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
 
     for clf_name, clf in clfs:
 
@@ -83,24 +84,26 @@ def MultClassPipeline(X_train, X_test, y_train, y_test):
 
         # Precision
         pres = precision_score(y_test, y_pred)
         # Recall
-        rcall = recall_score(y_test, y_pred)
+        recall = recall_score(y_test, y_pred)
         # Accuracy
         accu = accuracy_score(y_test, y_pred)
         # ROC_AUC
        roc_auc = roc_auc_score(y_test, y_pred)
-
+        # Matthews correlation coefficient
+        mcc = matthews_corrcoef(y_test, y_pred)
+
        pipelines.append(pipeline)
 
        scores_df = scores_df.append({
-                            'Model' : clf_name,
-                            'F1_Score' : fscore,
-                            'Precision' : pres,
-                            'Recall' : rcall,
-                            'Accuracy' : accu,
-                            'ROC_AUC' : roc_auc
-
-                            },
-                            ignore_index = True)
+                            'Model' : clf_name
+                            , 'F1_Score' : fscore
+                            , 'MCC' : mcc
+                            , 'Precision' : pres
+                            , 'Recall' : recall
+                            , 'Accuracy' : accu
+                            , 'ROC_AUC' : roc_auc
+                            }
+                            , ignore_index = True)
 
     return pipelines, scores_df
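Side note on the `scores_df.append(...)` pattern being reshaped above (and again in MultClassPipe2.py below), not part of the patch itself: `pandas.DataFrame.append` is deprecated from pandas 1.4 and removed in 2.0. A minimal sketch of an equivalent pattern, with placeholder metric values purely for illustration:

```python
# Sketch only: collect one dict per model, build the frame once (no .append()).
# The classifier names and metric values below are placeholders, not results
# produced by this repo.
import pandas as pd

rows = []
for clf_name, (fscore, mcc) in [('Logistic Regression', (0.76, 0.45)),
                                ('Naive Bayes', (0.63, 0.26))]:
    rows.append({'Model': clf_name, 'F1_Score': fscore, 'MCC': mcc})

scores_df = pd.DataFrame(rows)   # same kind of table the pipelines build
print(scores_df)
```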
diff --git a/MultClassPipe2.py b/MultClassPipe2.py
index e4ea381..9fe4619 100644
--- a/MultClassPipe2.py
+++ b/MultClassPipe2.py
@@ -21,7 +21,8 @@ from sklearn.compose import ColumnTransformer
 from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
+from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
 #%%
 rs = {'random_state': 42}
 # Done: add preprocessing step with one hot encoder
@@ -70,10 +71,9 @@ def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df):
         ('XGBoost', xgb)
     ]
 
-    pipelines = []
 
-    scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
+    scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
 
     for clf_name, clf in clfs:
         #%%
@@ -101,10 +101,12 @@ def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df):
 
         # F1-Score
         fscore = f1_score(y_test, y_pred)
+        # Matthews correlation coefficient
+        mcc = matthews_corrcoef(y_test, y_pred)
         # Precision
         pres = precision_score(y_test, y_pred)
         # Recall
-        rcall = recall_score(y_test, y_pred)
+        recall = recall_score(y_test, y_pred)
         # Accuracy
         accu = accuracy_score(y_test, y_pred)
         # ROC_AUC
@@ -113,15 +115,15 @@ def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df):
 
        pipelines.append(pipeline)
 
        scores_df = scores_df.append({
-                            'Model' : clf_name,
-                            'F1_Score' : fscore,
-                            'Precision' : pres,
-                            'Recall' : rcall,
-                            'Accuracy' : accu,
-                            'ROC_AUC' : roc_auc
-
-                            },
-                            ignore_index = True)
+                            'Model' : clf_name
+                            , 'F1_Score' : fscore
+                            , 'MCC' : mcc
+                            , 'Precision' : pres
+                            , 'Recall' : recall
+                            , 'Accuracy' : accu
+                            , 'ROC_AUC' : roc_auc
+                            }
+                            , ignore_index = True)
 
     return pipelines, scores_df
diff --git a/__pycache__/MultClassPipe.cpython-37.pyc b/__pycache__/MultClassPipe.cpython-37.pyc
index 2156ad9..b6c5c1b 100644
Binary files a/__pycache__/MultClassPipe.cpython-37.pyc and b/__pycache__/MultClassPipe.cpython-37.pyc differ
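For context on why `matthews_corrcoef` is added alongside accuracy and F1 throughout this patch (an aside, not part of the diff): MCC uses all four confusion-matrix cells, so a trivial majority-class predictor cannot score well on it. A small self-contained illustration on toy labels:

```python
# Sketch: accuracy rewards a majority-class predictor on imbalanced data,
# MCC does not (it comes out as 0, i.e. no better than chance).
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef

y_true = [0] * 90 + [1] * 10
y_pred = [0] * 100                                 # always predict the majority class

print(accuracy_score(y_true, y_pred))              # 0.9
print(matthews_corrcoef(y_true, y_pred))           # 0.0
print(f1_score(y_true, y_pred, zero_division=0))   # 0.0
```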
diff --git a/imports.py b/imports.py
index 0735ec6..2eaf070 100644
--- a/imports.py
+++ b/imports.py
@@ -21,12 +21,15 @@ from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
 from sklearn.compose import ColumnTransformer
 from sklearn.compose import make_column_transformer
 
-from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
+
+from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
 from sklearn.metrics import make_scorer
 from sklearn.metrics import classification_report
 
 from sklearn.model_selection import cross_validate
 from sklearn.model_selection import train_test_split
+from sklearn.model_selection import StratifiedKFold
 
 from sklearn.pipeline import Pipeline
 from sklearn.pipeline import make_pipeline
@@ -39,13 +42,15 @@ import matplotlib.pyplot as plt
 import numpy as np
 print(np.__version__)
 print(pd.__version__)
-from statistics import mean, stdev
+from statistics import mean, stdev, median, mode
 #%%
 homedir = os.path.expanduser("~")
 os.chdir(homedir + "/git/ML_AI_training/")
 
 # my function
-from MultClassPipe import MultClassPipeline
+from MultClassPipe import MultClassPipeline
+from MultClassPipe2 import MultClassPipeline2
+from MultClassPipe3 import MultClassPipeSKF
 
 gene = 'pncA'
 drug = 'pyrazinamide'
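A quick aside on the newly imported `StratifiedKFold` (toy data, not from this project): unlike a plain `KFold`, it keeps the class ratio of the target roughly constant in every fold, which is what the cross-validation loops in my_data10.py below rely on for an imbalanced phenotype.

```python
# Sketch: every test fold preserves the 75/25 class split of the toy target.
import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.arange(20).reshape(-1, 1)        # toy feature matrix
y = np.array([0] * 15 + [1] * 5)        # imbalanced toy target

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    print(fold, np.bincount(y[test_idx]))   # prints [3 1] for every fold
```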
diff --git a/my_data10.py b/my_data10.py
index d37ff41..43148f1 100644
--- a/my_data10.py
+++ b/my_data10.py
@@ -6,14 +6,11 @@ Created on Sat Mar 5 12:57:32 2022
 @author: tanu
 """
 #%%
-# data, etc for now comes from my_data6.py and/or my_data5.py
-#%%
+# Data, etc for now comes from my_data6.py and/or my_data5.py
+#%% Specify dir and import functions
 homedir = os.path.expanduser("~")
 os.chdir(homedir + "/git/ML_AI_training/")
-
-# my function
-from MultClassPipe2 import MultClassPipeline2
-#%% try combinations
+#%% Try combinations
 #import sys, os
 #os.system("imports.py")
 def precision(y_true,y_pred):
@@ -23,13 +20,12 @@ def recall(y_true,y_pred):
 def f1(y_true,y_pred):
     return f1_score(y_true, y_pred, pos_label = 1)
 
-#%%
-
+#%% Check df features
 numerical_features_df.shape
 categorical_features_df.shape
 all_features_df.shape
 all_features_df.dtypes
-#%%
+#%% Simple train and test data splits
 target = target1
 #target = target3
 X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df,
@@ -46,44 +42,231 @@ X_train, X_test, y_train, y_test = train_test_split(all_features_df,
                                                     target,
                                                     test_size = 0.33,
                                                     random_state = 42)
-#%%
+#%% Stratified K-fold: Single model
+model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
+                           , ('log_reg', LogisticRegression(class_weight = 'balanced')) ])
+model1
+rs = {'random_state': 42}
+log_reg = LogisticRegression(**rs)
+nb = BernoulliNB()
+clfs = [('Logistic Regression', log_reg)
+        ,('Naive Bayes', nb)]
+seed_skf = 42
+skf = StratifiedKFold(n_splits = 10
+                      , shuffle = True
+                      , random_state = seed_skf)
 
-#%% with feature selection
+X_array = np.array(numerical_features_df)
+Y = target1
 
-# Determine categorical and numerical features
-input_df = numerical_features_df.copy()
-#input_df = categorical_features_df
-#input_df = all_features_df
+model_scores_df = pd.DataFrame()
+fscoreL = []
+mccL = []
+presL = []
+recallL = []
+accuL = []
+roc_aucL = []
 
-numerical_ix = all_features_df.select_dtypes(include=['int64', 'float64']).columns
+for train_index, test_index in skf.split(X_array, Y):
+    x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
+    y_train_fold, y_test_fold = Y[train_index], Y[test_index]
+
+    model1.fit(x_train_fold, y_train_fold)
+    y_pred_fold = model1.predict(x_test_fold)
+
+    #----------------
+    # Model metrics
+    #----------------
+    # F1-Score
+    fscore = f1_score(y_test_fold, y_pred_fold)
+    fscoreL.append(fscore)
+    fscoreM = mean(fscoreL)
+
+    # Matthews correlation coefficient
+    mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
+    mccL.append(mcc)
+    mccM = mean(mccL)
+
+    # Precision
+    pres = precision_score(y_test_fold, y_pred_fold)
+    presL.append(pres)
+    presM = mean(presL)
+
+    # Recall
+    recall = recall_score(y_test_fold, y_pred_fold)
+    recallL.append(recall)
+    recallM = mean(recallL)
+
+    # Accuracy
+    accu = accuracy_score(y_test_fold, y_pred_fold)
+    accuL.append(accu)
+    accuM = mean(accuL)
+
+    # ROC_AUC
+    roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
+    roc_aucL.append(roc_auc)
+    roc_aucM = mean(roc_aucL)
+
+model_scores_df = model_scores_df.append({'Model'     : model1.steps[1][0]
+                                          ,'F1_score'  : fscoreM
+                                          , 'MCC'      : mccM
+                                          , 'Precision': presM
+                                          , 'Recall'   : recallM
+                                          , 'Accuracy' : accuM
+                                          , 'ROC_curve': roc_aucM}
+                                          , ignore_index = True)
+print('\nModel metrics:', model_scores_df)
+#%% stratified KFold: Multiple_models:
+input_df = numerical_features_df
+#X_array = np.array(input_df)
+Y = target1
+var_type = 'numerical'
+
+input_df = all_features_df
+#X_array = np.array(input_df)
+Y = target1
+var_type = 'mixed'
+
+input_df = categorical_features_df
+#X_array = np.array(input_df)
+Y = target1
+var_type = 'categorical'
+
+#=================
+numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
 numerical_ix
-categorical_ix = all_features_df.select_dtypes(include=['object', 'bool']).columns
-categorical_ix
-# prepare data
-t = [('num', MinMaxScaler(), numerical_ix)
-    , ('cat', OneHotEncoder(), categorical_ix)]
-
+categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+categorical_ix
+
+# Determine preprocessing steps ~ var_type
+if var_type == 'numerical':
+    t = [('num', MinMaxScaler(), numerical_ix)]
+
+if var_type == 'categorical':
+    t = [('cat', OneHotEncoder(), categorical_ix)]
+
+if var_type == 'mixed':
+    t = [('cat', OneHotEncoder(), categorical_ix)
+         , ('num', MinMaxScaler(), numerical_ix)]
+
+##############################
 col_transform = ColumnTransformer(transformers = t
-                                  , remainder = 'passthrough')
+                                  , remainder='passthrough')
 
-# model pipeline
-model = Pipeline(steps=[('prep', col_transform)
-                        , ('classifier', LogisticRegression())])
-model.fit(X_train, y_train)
-y_pred = model.predict(X_test)
-y_pred
+rs = {'random_state': 42}
 
-selector_log = RFECV(estimator = model
-                     , cv = 10
-                     , step = 1)
+#log_reg = LogisticRegression(**rs)
+log_reg = LogisticRegression(class_weight = 'balanced')
+nb = BernoulliNB()
+rf = RandomForestClassifier(**rs)
 
-selector_log_x = selector_log.fit_transform(X_train, y_train)
+clfs = [('Logistic Regression', log_reg)
+        ,('Naive Bayes', nb)
+        , ('Random Forest' , rf)
+        ]
 
-#seed_skf = 42
+skf = StratifiedKFold(n_splits = 10
+                      , shuffle = True
+                      #, random_state = seed_skf
+                      , **rs)
 
-scores_df = pd.DataFrame()
+fscoreL = []
+mccL = []
+presL = []
+recallL = []
+accuL = []
+roc_aucL = []
 
-print(selector_logistic_x.ranking_)
\ No newline at end of file
+for train_index, test_index in skf.split(input_df, Y):
+    print('\nSKF train index:', train_index
+          , '\nSKF test index:', test_index)
+    x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
+    y_train_fold, y_test_fold = Y.iloc[train_index], Y.iloc[test_index]
+# for train_index, test_index in skf.split(X_array, Y):
+#     print('\nSKF train index:', train_index
+#           , '\nSKF test index:', test_index)
+    # x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
+    # y_train_fold, y_test_fold = Y[train_index], Y[test_index]
+
+
+    clf_scores_df = pd.DataFrame()
+    for clf_name, clf in clfs:
+        # model2 = Pipeline(steps=[('preprocess', MinMaxScaler())
+        #                          , ('classifier', clf)])
+        model2 = Pipeline(steps=[('preprocess', col_transform)
+                                 , ('classifier', clf)])
+
+        model2.fit(x_train_fold, y_train_fold)
+        y_pred_fold = model2.predict(x_test_fold)
+
+        #----------------
+        # Model metrics
+        #----------------
+        # F1-Score
+        fscore = f1_score(y_test_fold, y_pred_fold)
+        fscoreL.append(fscore)
+        fscoreM = mean(fscoreL)
+
+        # Matthews correlation coefficient
+        mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
+        mccL.append(mcc)
+        mccM = mean(mccL)
+
+        # Precision
+        pres = precision_score(y_test_fold, y_pred_fold)
+        presL.append(pres)
+        presM = mean(presL)
+
+        # Recall
+        recall = recall_score(y_test_fold, y_pred_fold)
+        recallL.append(recall)
+        recallM = mean(recallL)
+
+        # Accuracy
+        accu = accuracy_score(y_test_fold, y_pred_fold)
+        accuL.append(accu)
+        accuM = mean(accuL)
+
+        # ROC_AUC
+        roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
+        roc_aucL.append(roc_auc)
+        roc_aucM = mean(roc_aucL)
+
+        clf_scores_df = clf_scores_df.append({'Model'    : clf_name
+                                              ,'F1_score' : fscoreM
+                                              , 'MCC'      : mccM
+                                              , 'Precision': presM
+                                              , 'Recall'   : recallM
+                                              , 'Accuracy' : accuM
+                                              , 'ROC_curve': roc_aucM}
+                                              , ignore_index = True)
+    #scores_df = scores_df.append(clf_scores_df)
+
+
+#%% Call functions
+
+tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN)
+tN_res
+
+t2_res = MultClassPipeline2(X_train, X_test, y_train, y_test, input_df = all_features_df)
+t2_res
+
+#CHECK: numbers are awfully close to each other!
+
+t3_res = MultClassPipeSKF(input_df = numerical_features_df
+                          , y_targetF = target1
+                          , var_type = 'numerical'
+                          , skf_splits = 10)
+t3_res
+
+#CHECK: numbers are awfully close to each other!
+
+t4_res = MultClassPipeSKF(input_df = all_features_df
+                          , y_targetF = target1
+                          , var_type = 'mixed'
+                          , skf_splits = 10)
+t4_res
\ No newline at end of file
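One likely reason for the "numbers are awfully close to each other" checks above: in the multi-model loop, `fscoreL`, `mccL`, `presL`, `recallL`, `accuL` and `roc_aucL` are created once and shared by every classifier, so each reported mean pools all models and folds seen so far. `MultClassPipe3.py` itself is not part of this diff; judging only from the call signature used above (`input_df`, `y_targetF`, `var_type`, `skf_splits`), a `MultClassPipeSKF`-shaped wrapper that keeps the per-fold lists local to each model might look roughly like the sketch below. It is illustrative only and may differ from the real implementation.

```python
# Illustrative sketch only: a MultClassPipeSKF-shaped wrapper around the
# stratified K-fold loop from my_data10.py, with the per-fold metric lists
# reset for each classifier. Assumes input_df is a DataFrame and y_targetF
# is a pandas Series, as in the calls above.
import pandas as pd
from statistics import mean
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, matthews_corrcoef,
                             precision_score, recall_score, roc_auc_score)
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder


def MultClassPipeSKF(input_df, y_targetF, var_type, skf_splits=10):
    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns

    # Preprocessing chosen by var_type, mirroring my_data10.py
    if var_type == 'numerical':
        t = [('num', MinMaxScaler(), numerical_ix)]
    elif var_type == 'categorical':
        t = [('cat', OneHotEncoder(), categorical_ix)]
    else:  # 'mixed'
        t = [('cat', OneHotEncoder(), categorical_ix),
             ('num', MinMaxScaler(), numerical_ix)]
    col_transform = ColumnTransformer(transformers=t, remainder='passthrough')

    clfs = [('Logistic Regression', LogisticRegression(class_weight='balanced')),
            ('Naive Bayes', BernoulliNB())]
    skf = StratifiedKFold(n_splits=skf_splits, shuffle=True, random_state=42)

    rows = []
    for clf_name, clf in clfs:
        # fresh per-fold lists for every model, so means stay per-model
        fold_scores = {m: [] for m in
                       ['F1_score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_curve']}
        for train_index, test_index in skf.split(input_df, y_targetF):
            x_train, x_test = input_df.iloc[train_index], input_df.iloc[test_index]
            y_train, y_test = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
            model = Pipeline(steps=[('preprocess', col_transform), ('classifier', clf)])
            model.fit(x_train, y_train)
            y_pred = model.predict(x_test)
            fold_scores['F1_score'].append(f1_score(y_test, y_pred))
            fold_scores['MCC'].append(matthews_corrcoef(y_test, y_pred))
            fold_scores['Precision'].append(precision_score(y_test, y_pred))
            fold_scores['Recall'].append(recall_score(y_test, y_pred))
            fold_scores['Accuracy'].append(accuracy_score(y_test, y_pred))
            fold_scores['ROC_curve'].append(roc_auc_score(y_test, y_pred))
        rows.append({'Model': clf_name, **{m: mean(v) for m, v in fold_scores.items()}})

    return pd.DataFrame(rows)
```

A call such as `MultClassPipeSKF(input_df = numerical_features_df, y_targetF = target1, var_type = 'numerical', skf_splits = 10)` would then return one averaged row per model, matching how the function is used in this diff.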
diff --git a/my_data9.py b/my_data9.py
index 0b1f4b8..7c6d05f 100644
--- a/my_data9.py
+++ b/my_data9.py
@@ -7,12 +7,6 @@ Created on Sat Mar 5 12:57:32 2022
 """
 #%%
 # data, etc for now comes from my_data6.py and/or my_data5.py
-#%%
-homedir = os.path.expanduser("~")
-os.chdir(homedir + "/git/ML_AI_training/")
-
-# my function
-from MultClassPipe2 import MultClassPipeline2
 #%% try combinations
 #import sys, os
 #os.system("imports.py")
@@ -130,5 +124,21 @@ pipeline = Pipeline(steps=[('prep', col_transform)
                            , ('classifier', LogisticRegression())])
 
 #%% Added this to the MultClassPipeline
+tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN)
+tN_res
+
 t2_res = MultClassPipeline2(X_train, X_test, y_train, y_test, input_df = all_features_df)
-t2_res
\ No newline at end of file
+t2_res
+
+t3_res = MultClassPipeSKF(input_df = numerical_features_df
+                          , y_targetF = target1
+                          , var_type = 'numerical'
+                          , skf_splits = 10)
+t3_res
+
+
+t4_res = MultClassPipeSKF(input_df = all_features_df
+                          , y_targetF = target1
+                          , var_type = 'mixed'
+                          , skf_splits = 10)
+t4_res
\ No newline at end of file
diff --git a/pnca_results_v1.py b/pnca_results_v1.py
index 7d8b097..2c6724d 100644
--- a/pnca_results_v1.py
+++ b/pnca_results_v1.py
@@ -85,3 +85,15 @@ all_features: numerical_features + ['ss_class', 'wt_prop_water', 'mut_prop_water
 9All              XGBoost  0.710526   0.692308  0.729730  0.685714  0.683047)
 
+#%%
+                  Model  F1_Score  Precision    Recall  Accuracy   ROC_AUC
+ 0  Logistic Regression  0.757764   0.701149  0.824324  0.721429  0.715192
+ 1          Naive Bayes  0.628571   0.666667  0.594595  0.628571  0.630631
+ 2  K-Nearest Neighbors  0.666667   0.623529  0.716216  0.621429  0.615684
+ 3                  SVM  0.766467   0.688172  0.864865  0.721429  0.712735
+ 4                  MLP  0.726115   0.686747  0.770270  0.692857  0.688165
+ 5        Decision Tree  0.647482   0.692308  0.608108  0.650000  0.652539
+ 6          Extra Trees  0.760736   0.696629  0.837838  0.721429  0.714373
+ 7        Random Forest  0.736196   0.674157  0.810811  0.692857  0.685708
+ 8       Random Forest2  0.736196   0.674157  0.810811  0.692857  0.685708
+ 9              XGBoost  0.710526   0.692308  0.729730  0.685714  0.683047)
\ No newline at end of file