From 1bfb35c30c9e54f9090b0dc800c19bcc6ad07759 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 9 Mar 2022 18:35:54 +0000 Subject: [PATCH] trying Stratified Kfold split on running multiple pipelines --- MultClassPipe.py | 29 +-- MultClassPipe2.py | 28 +-- __pycache__/MultClassPipe.cpython-37.pyc | Bin 2451 -> 2530 bytes imports.py | 11 +- my_data10.py | 255 +++++++++++++++++++---- my_data9.py | 24 ++- pnca_results_v1.py | 12 ++ 7 files changed, 287 insertions(+), 72 deletions(-) diff --git a/MultClassPipe.py b/MultClassPipe.py index 217bbe9..44506aa 100644 --- a/MultClassPipe.py +++ b/MultClassPipe.py @@ -20,7 +20,8 @@ from xgboost import XGBClassifier from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import MinMaxScaler from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score +from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score +from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef #%% rs = {'random_state': 42} # TODO: add preprocessing step with one hot encoder @@ -63,7 +64,7 @@ def MultClassPipeline(X_train, X_test, y_train, y_test): pipelines = [] - scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC']) + scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC']) for clf_name, clf in clfs: @@ -83,24 +84,26 @@ def MultClassPipeline(X_train, X_test, y_train, y_test): # Precision pres = precision_score(y_test, y_pred) # Recall - rcall = recall_score(y_test, y_pred) + recall = recall_score(y_test, y_pred) # Accuracy accu = accuracy_score(y_test, y_pred) # ROC_AUC roc_auc = roc_auc_score(y_test, y_pred) - + # Matthews correlation coefficient + mcc = matthews_corrcoef(y_test, y_pred) + pipelines.append(pipeline) scores_df = scores_df.append({ - 'Model' : clf_name, - 'F1_Score' : fscore, - 'Precision' : pres, - 'Recall' : rcall, - 'Accuracy' : accu, - 'ROC_AUC' : roc_auc - - }, - ignore_index = True) + 'Model' : clf_name + , 'F1_Score' : fscore + , 'MCC' : mcc + , 'Precision' : pres + , 'Recall' : recall + , 'Accuracy' : accu + , 'ROC_AUC' : roc_auc + } + , ignore_index = True) return pipelines, scores_df diff --git a/MultClassPipe2.py b/MultClassPipe2.py index e4ea381..9fe4619 100644 --- a/MultClassPipe2.py +++ b/MultClassPipe2.py @@ -21,7 +21,8 @@ from sklearn.compose import ColumnTransformer from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import MinMaxScaler, OneHotEncoder from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score +from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score +from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef #%% rs = {'random_state': 42} # Done: add preprocessing step with one hot encoder @@ -70,10 +71,9 @@ def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df): ('XGBoost', xgb) ] - pipelines = [] - scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC']) + scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC']) for clf_name, clf in clfs: #%% @@ -101,10 +101,12 @@ def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df): # F1-Score fscore = f1_score(y_test, y_pred) + # Matthews correlation coefficient + mcc = matthews_corrcoef(y_test, y_pred) # Precision pres = precision_score(y_test, y_pred) # Recall - rcall = recall_score(y_test, y_pred) + recall = recall_score(y_test, y_pred) # Accuracy accu = accuracy_score(y_test, y_pred) # ROC_AUC @@ -113,15 +115,15 @@ def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df): pipelines.append(pipeline) scores_df = scores_df.append({ - 'Model' : clf_name, - 'F1_Score' : fscore, - 'Precision' : pres, - 'Recall' : rcall, - 'Accuracy' : accu, - 'ROC_AUC' : roc_auc - - }, - ignore_index = True) + 'Model' : clf_name + , 'F1_Score' : fscore + , 'MCC' : mcc + , 'Precision' : pres + , 'Recall' : recall + , 'Accuracy' : accu + , 'ROC_AUC' : roc_auc + } + , ignore_index = True) return pipelines, scores_df diff --git a/__pycache__/MultClassPipe.cpython-37.pyc b/__pycache__/MultClassPipe.cpython-37.pyc index 2156ad951b720a9ddd80148d090878057ef51153..b6c5c1bead891a2eca7e055f56ae121e74a25e40 100644 GIT binary patch delta 551 zcmY+Azi-n(6vurR$1Wvtoj5<-q;1GA8bkRZ1WbSsK_HmTO0*QgmLJa%`44s&`+u)tvM|$u3J>7f9j9-R!E1R8{cx-&> zJbJI~YbH3}U;TU$NEl#!wCp$RhTpWCe#>t8ZM)6;Ih-E?`!qIi;iY7+Vz?n$#iIha z0Ge{5qcHF$(J%~pz7vz-G|?P{Cd5I{dl3qW%RE?vmdswkc0R`~@xXl+^|(S%=z8oB zE@@Rh&PkqKp3w{T9e!KGOMMkD?}3>}t4M}O42;F5TbTow=-+58_{yiDe< X`5~T++#4#dq81BI)D;t$a8CXUc(saY delta 521 zcmY+APiqrF7{+&IH?wQqY&KbwrqybbrcJjh;-8~>h@v3kp%sKhS=OCRx{}?EGYO

nf={np9aK|5OsR4S!YrDI4$drEIQ_Y z;g5|0!sm9u%$>et&Ry-<+oy0=oz=!RUK*EQYki2%vsxbxAiltV>kn_R%6^@B9|6#q z`=ZXOtj-#2o;BG5UVe{o3Ac9P%zY`IXsp8AzKYT2uZ$L$=O2s*puvxfFZ@&K(N#pc z(gayo@D(-`E-GA-@PV+?K`zsVX^xYr-{3yl_A2V#lMpBxjglPui~IoXFO-N%D-88f z!X!l;Mw9qJBb$8NyoZ*gfMWjHyyY~Z10k@WquctN&VkkJBHf1MvD(=cW*A~A4h1Zz y(CH2pXcXl*qU8Ev&$(