diff --git a/MultClassPipe2.py b/MultClassPipe2.py
index 9fe4619..20261c2 100644
--- a/MultClassPipe2.py
+++ b/MultClassPipe2.py
@@ -77,12 +77,6 @@ def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df):
     for clf_name, clf in clfs:
 #%%
-        # pipeline = Pipeline(steps=[
-        #     ('scaler', MinMaxScaler()),
-        #     #('scaler', StandardScaler()),
-        #     ('classifier', clf)
-        #     ]
-        # )
         # define the data preparation for the columns
         t = [('cat', OneHotEncoder(), categorical_ix)
              , ('num', MinMaxScaler(), numerical_ix)]
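For reference, the ColumnTransformer preprocessing that MultClassPipeline2 switches to looks like this in isolation; a minimal, self-contained sketch on a toy frame (the column names below are made up, not from the repo):

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# toy frame: one numeric and one categorical column (illustrative names only)
toy_df = pd.DataFrame({'num_feat': [0.1, 0.5, 0.9]
                       , 'cat_feat': ['helix', 'sheet', 'coil']})
numerical_ix = toy_df.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = toy_df.select_dtypes(include=['object', 'bool']).columns

# same shape as the hunk above: one-hot the categoricals, min-max the numericals
t = [('cat', OneHotEncoder(), categorical_ix)
     , ('num', MinMaxScaler(), numerical_ix)]
col_transform = ColumnTransformer(transformers = t, remainder = 'passthrough')
X_enc = col_transform.fit_transform(toy_df)  # 3 one-hot columns + 1 scaled column

With remainder='passthrough', columns matched by neither transformer are forwarded untouched rather than dropped.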
diff --git a/MultClassPipe3.py b/MultClassPipe3.py
index b5570ae..4dfdc5b 100644
--- a/MultClassPipe3.py
+++ b/MultClassPipe3.py
@@ -6,51 +6,79 @@ Created on Fri Mar 4 15:25:33 2022
 @author: tanu
 """
 #%%
+
 import os, sys
 import pandas as pd
 import numpy as np
-from sklearn.linear_model import LogisticRegression
+import pprint as pp
+#from copy import deepcopy
+from sklearn import linear_model
+from sklearn.linear_model import LogisticRegression, LinearRegression
 from sklearn.naive_bayes import BernoulliNB
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.svm import SVC
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
 from sklearn.neural_network import MLPClassifier
-from sklearn.pipeline import Pipeline
 from xgboost import XGBClassifier
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
+
 from sklearn.compose import ColumnTransformer
-from sklearn.preprocessing import StandardScaler
-from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
+from sklearn.compose import make_column_transformer
+
+from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
+from sklearn.metrics import make_scorer
+from sklearn.metrics import classification_report
+
+from sklearn.metrics import average_precision_score
 from sklearn.model_selection import cross_validate
 from sklearn.model_selection import train_test_split
 from sklearn.model_selection import StratifiedKFold
-from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
-from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
+from sklearn.pipeline import Pipeline
+from sklearn.pipeline import make_pipeline
+
+from sklearn.feature_selection import RFE
+from sklearn.feature_selection import RFECV
+import itertools
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+print(np.__version__)
+print(pd.__version__)
 from statistics import mean, stdev, median, mode
+
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.over_sampling import SMOTE
+from imblearn.pipeline import Pipeline
+#from sklearn.datasets import make_classification
+from sklearn.model_selection import cross_validate
+from sklearn.model_selection import RepeatedStratifiedKFold
+from sklearn.ensemble import AdaBoostClassifier
+from imblearn.combine import SMOTEENN
+from imblearn.under_sampling import EditedNearestNeighbours
+
 #%%
 rs = {'random_state': 42}
 # Done: add preprocessing step with one hot encoder
-# TODO: supply stratified K-fold cv train and test data
-# TODO: get accuracy and other scores through K-fold cv
+# Done: get accuracy and other scores through K-fold stratified cv
+
+scoring_fn = ({ 'fscore'    : make_scorer(f1_score)
+              , 'mcc'       : make_scorer(matthews_corrcoef)
+              , 'precision' : make_scorer(precision_score)
+              , 'recall'    : make_scorer(recall_score)
+              , 'accuracy'  : make_scorer(accuracy_score)
+              , 'roc_auc'   : make_scorer(roc_auc_score)
+              #, 'jaccard'  : make_scorer(jaccard_score)
+              })
+
 # Multiple Classification - Model Pipeline
-def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical','mixed'], skf_splits = 10):
+def MultClassPipelineCV(X_train, X_test, y_train, y_test, input_df, var_type = ['numerical', 'categorical','mixed']):
-    '''
-    @ param input_df: input features
-    @ type: df (gets converted to np.array for stratified Kfold, and helps identify names to apply column transformation)
-
-    @param y_outputF: target (or output) feature
-    @type: df or np.array
-
-
-    returns
-    multiple classification model scores
-
-    '''
-    # Determine categorical and numerical features
+    # determine categorical and numerical features
     numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
     numerical_ix
     categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
@@ -69,129 +97,67 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
     col_transform = ColumnTransformer(transformers = t
                                       , remainder='passthrough')
-
-#%% Define classification models to run
+
+    #%%
     log_reg = LogisticRegression(**rs)
-    nb = BernoulliNB()
-    knn = KNeighborsClassifier()
-    svm = SVC(**rs)
-    mlp = MLPClassifier(max_iter = 500, **rs)
-    dt = DecisionTreeClassifier(**rs)
-    et = ExtraTreesClassifier(**rs)
-    rf = RandomForestClassifier(**rs)
-    rf2 = RandomForestClassifier(
-        min_samples_leaf = 50,
-        n_estimators = 150,
-        bootstrap = True,
-        oob_score = True,
-        n_jobs = -1,
-        random_state = 42,
-        max_features = 'auto')
+    nb = BernoulliNB()
+    knn = KNeighborsClassifier()
+    svm = SVC(**rs)
+    mlp = MLPClassifier(max_iter=500, **rs)
+    dt = DecisionTreeClassifier(**rs)
+    et = ExtraTreesClassifier(**rs)
+    rf = RandomForestClassifier(**rs)
+    rf2 = RandomForestClassifier(
+        min_samples_leaf=50,
+        n_estimators=150,
+        bootstrap=True,
+        oob_score=True,
+        n_jobs=-1,
+        random_state=42,
+        max_features='auto')
-    xgb = XGBClassifier(**rs, verbosity = 0)
+    xgb = XGBClassifier(**rs, verbosity=0)
-    clfs = [
-        ('Logistic Regression' , log_reg)
-        #, ('Naive Bayes' , nb)
-        , ('K-Nearest Neighbors', knn)
-        , ('SVM' , svm)
-        , ('MLP' , mlp)
-        , ('Decision Tree' , dt)
-        , ('Extra Trees' , et)
-        , ('Random Forest' , rf)
-        , ('Naive Bayes' , nb)
-
-        #, ('Random Forest2' , rf2)
-        #, ('XGBoost' , xgb)
+    models = [
+        ('Logistic Regression', log_reg),
+        ('Naive Bayes', nb),
+        ('K-Nearest Neighbors', knn),
+        ('SVM', svm),
+        ('MLP', mlp),
+        ('Decision Tree', dt),
+        ('Extra Trees', et),
+        ('Random Forest', rf),
+        ('Random Forest2', rf2),
+        #('XGBoost', xgb)
     ]
-
-    skf = StratifiedKFold(n_splits = skf_splits
-                          , shuffle = True
-                          #, random_state = seed_skf
-                          , **rs)
-
-    X_array = np.array(input_df)
-    Y = y_targetF
-
-    # Initialise score metrics list to store skf results
-    # fscoreL = []
-    # mccL = []
-    # presL = []
-    # recallL = []
-    # accuL = []
-    # roc_aucL = []
-    skf_dict = {}
+
+    skf_cv_scores = {}
+
+    for model_name, model_fn in models:
+        print('\nModel_name:', model_name
+              , '\nModel func:' , model_fn
+              , '\nList of models:', models)
-    #scores_df = pd.DataFrame()
-    for train_index, test_index in skf.split(input_df, y_targetF):
-        x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
-        y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
-        #fscoreL = {}
+        # model_pipeline = Pipeline([
+        #     ('pre'    , MinMaxScaler())
+        #     , ('model' , model_fn)])
+
+        model_pipeline = Pipeline([
+            ('prep'    , col_transform)
+            , ('model' , model_fn)])
+
+        print('Running model pipeline:', model_pipeline)
+        skf_cv = cross_validate(model_pipeline
+                                , X_train
+                                , y_train
+                                , cv = 10
+                                , scoring = scoring_fn
+                                , return_train_score = True)
+        skf_cv_scores[model_name] = {}
+        for key, value in skf_cv.items():
+            print('\nkey:', key, '\nvalue:', value)
+            print('\nmean value:', mean(value))
+            skf_cv_scores[model_name][key] = round(mean(value),2)
+    #pp.pprint(skf_cv_scores)
+    return(skf_cv_scores)
-        # for train_index, test_index in skf.split(X_array, Y):
-        #     print('\nSKF train index:', train_index
-        #           , '\nSKF test index:', test_index)
-        #     x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
-        #     y_train_fold, y_test_fold = Y[train_index], Y[test_index]
-
-        clf_scores_df = pd.DataFrame()
-
-        for clf_name, clf in clfs:
-            print('\nRunning the following classification models'
-                  , clf_name)
-
-            model_pipeline = Pipeline(steps=[('prep'       , col_transform)
-                                             , ('classifier' , clf)])
-
-            # model_pipeline = Pipeline(steps=[('prep'       , MinMaxScaler())
-            #                                  , ('classifier' , clf)])
-
-            model_pipeline.fit(x_train_fold, y_train_fold)
-            y_pred_fold = model_pipeline.predict(x_test_fold)
-
-            #----------------
-            # Model metrics
-            #----------------
-            # F1-Score
-            fscore = f1_score(y_test_fold, y_pred_fold)
-            fscoreL[clf_name].append(fscore)
-            print('fscoreL Len: ', len(fscoreL))
-            #fscoreM = mean(fscoreL[clf])
-
-            # Matthews correlation coefficient
-            mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
-            mccL[clf_name].append(mcc)
-            mccM = mean(mccL)
-
-            # # Precision
-            # pres = precision_score(y_test_fold, y_pred_fold)
-            # presL.append(pres)
-            # presM = mean(presL)
-
-            # # Recall
-            # recall = recall_score(y_test_fold, y_pred_fold)
-            # recallL.append(recall)
-            # recallM = mean(recallL)
-
-            # # Accuracy
-            # accu = accuracy_score(y_test_fold, y_pred_fold)
-            # accuL.append(accu)
-            # accuM = mean(accuL)
-
-            # # ROC_AUC
-            # roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
-            # roc_aucL.append(roc_auc)
-            # roc_aucM = mean(roc_aucL)
-
-            clf_scores_df = clf_scores_df.append({'Model'    : clf_name
-                                                  ,'F1_score' : fscoreM
-                                                  , 'MCC'      : mccM
-                                                  , 'Precision': presM
-                                                  , 'Recall'   : recallM
-                                                  , 'Accuracy' : accuM
-                                                  , 'ROC_curve': roc_aucM}
-                                                 , ignore_index = True)
-    return(clf_scores_df)
-    #scores_df = scores_df.append(clf_scores_df)
-# return clf_scores_df
\ No newline at end of file
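The core of the new MultClassPipelineCV is cross_validate driven by a dict of scorers; below is a self-contained sketch of that pattern with toy data and a trimmed two-scorer dict (everything in it is illustrative, not from the repo). One caveat worth noting: make_scorer(roc_auc_score), as used in scoring_fn above, computes AUC from hard 0/1 predictions; a probability-based AUC would need needs_proba = True or the built-in 'roc_auc' string scorer.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score, matthews_corrcoef
from sklearn.model_selection import cross_validate

X_toy, y_toy = make_classification(n_samples = 200, n_features = 5, random_state = 42)
toy_scorers = {'fscore': make_scorer(f1_score)
               , 'mcc': make_scorer(matthews_corrcoef)}
cv_out = cross_validate(LogisticRegression(random_state = 42)
                        , X_toy, y_toy
                        , cv = 10
                        , scoring = toy_scorers
                        , return_train_score = True)
# keys come back as 'test_<name>' / 'train_<name>' plus fit/score times
print(round(cv_out['test_fscore'].mean(), 2))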
diff --git a/__pycache__/MultClassPipe2.cpython-37.pyc b/__pycache__/MultClassPipe2.cpython-37.pyc
index 1cb8e8b..5812752 100644
Binary files a/__pycache__/MultClassPipe2.cpython-37.pyc and b/__pycache__/MultClassPipe2.cpython-37.pyc differ
diff --git a/__pycache__/MultClassPipe3.cpython-37.pyc b/__pycache__/MultClassPipe3.cpython-37.pyc
index 2777a29..f6693e8 100644
Binary files a/__pycache__/MultClassPipe3.cpython-37.pyc and b/__pycache__/MultClassPipe3.cpython-37.pyc differ
diff --git a/__pycache__/loopity_loop.cpython-37.pyc b/__pycache__/loopity_loop.cpython-37.pyc
index effcb8e..5439565 100644
Binary files a/__pycache__/loopity_loop.cpython-37.pyc and b/__pycache__/loopity_loop.cpython-37.pyc differ
diff --git a/imports.py b/imports.py
index a4c029e..928f59e 100644
--- a/imports.py
+++ b/imports.py
@@ -8,6 +8,7 @@ Created on Sun Mar 6 13:41:54 2022
 import os, sys
 import pandas as pd
 import numpy as np
+import pprint as pp
 #from copy import deepcopy
 from sklearn import linear_model
 from sklearn.linear_model import LogisticRegression, LinearRegression
@@ -64,6 +65,8 @@ os.chdir(homedir + "/git/ML_AI_training/")
 from MultClassPipe import MultClassPipeline
 from MultClassPipe2 import MultClassPipeline2
 from loopity_loop import MultClassPipeSKF
+from MultClassPipe3 import MultClassPipelineCV
+
 gene = 'pncA'
 drug = 'pyrazinamide'
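With the import in place, MultClassPipelineCV can be called wherever MultClassPipeSKF was; a hypothetical usage sketch on toy data (the 70/30 stratified split and the column names are assumptions, not part of this changeset, and the call presumes the function's var_type branch handles a purely numeric frame):

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from MultClassPipe3 import MultClassPipelineCV

# toy stand-in for the gene-specific frames used elsewhere in this repo
X_arr, y_arr = make_classification(n_samples = 120, n_features = 4, random_state = 42)
X = pd.DataFrame(X_arr, columns = ['f1', 'f2', 'f3', 'f4'])
y = pd.Series(y_arr)
X_train, X_test, y_train, y_test = train_test_split(X, y
                                                    , test_size = 0.3
                                                    , stratify = y
                                                    , random_state = 42)
cv_scores = MultClassPipelineCV(X_train, X_test, y_train, y_test
                                , input_df = X
                                , var_type = 'numerical')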
diff --git a/loopity_loop.py b/loopity_loop.py
index 17fd851..b4f00e7 100644
--- a/loopity_loop.py
+++ b/loopity_loop.py
@@ -82,13 +82,13 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
     et = ExtraTreesClassifier(**rs)
     rf = RandomForestClassifier(**rs)
     rf2 = RandomForestClassifier(
-        min_samples_leaf = 50,
-        n_estimators = 150,
-        bootstrap = True,
-        oob_score = True,
-        n_jobs = -1,
-        random_state = 42,
-        max_features = 'auto')
+        min_samples_leaf = 50
+        , n_estimators = 150
+        , bootstrap = True
+        , oob_score = True
+        , n_jobs = -1
+        , **rs
+        , max_features = 'auto')
     xgb = XGBClassifier(**rs, verbosity = 0)
     classification_metrics = {
@@ -97,20 +97,20 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
         ,'Precision': []
         ,'Recall': []
         ,'Accuracy': []
-        #,'ROC_AUC': []
+        ,'ROC_AUC': []
         }
     models = [
         ('Logistic Regression' , log_reg)
         , ('Naive Bayes' , nb)
         , ('K-Nearest Neighbors', knn)
         , ('SVM' , svm)
-        # , ('MLP' , mlp)
-        # , ('Decision Tree' , dt)
-        # , ('Extra Trees' , et)
-        # , ('Random Forest' , rf)
-        # , ('Naive Bayes' , nb)
+        , ('MLP' , mlp)
+        , ('Decision Tree' , dt)
+        , ('Extra Trees' , et)
+        , ('Random Forest' , rf)
+        , ('Naive Bayes' , nb)
-        #, ('Random Forest2' , rf2)
+        , ('Random Forest2' , rf2)
         #, ('XGBoost' , xgb)
         ]
@@ -118,7 +118,7 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
                           , shuffle = True
                           , **rs)
-    skf_dict = {}
+#    skf_dict = {}
     fold_no = 1
     fold_dict={}
@@ -145,12 +145,12 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
             #----------------
             fscore = f1_score(y_test_fold, y_pred_fold)
             mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
-            #pres = precision_score(y_test_fold, y_pred_fold)
-            #recall = recall_score(y_test_fold, y_pred_fold)
-            pres = precision_score(y_test_fold, y_pred_fold, zero_division=0)
-            recall = recall_score(y_test_fold, y_pred_fold, zero_division=0)
+            pres = precision_score(y_test_fold, y_pred_fold)
+            recall = recall_score(y_test_fold, y_pred_fold)
+            #pres = precision_score(y_test_fold, y_pred_fold, zero_division=0)
+            #recall = recall_score(y_test_fold, y_pred_fold, zero_division=0)
             accu = accuracy_score(y_test_fold, y_pred_fold)
-            #roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
+            roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
             fold=("fold_"+str(fold_no))
@@ -165,7 +165,7 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
             fold_dict[model_name][fold].update({'Precision' : pres})
             fold_dict[model_name][fold].update({'Recall'    : recall})
             fold_dict[model_name][fold].update({'Accuracy'  : accu})
-            #fold_dict[model_name][fold].update({'ROC_AUC'   : roc_auc})
+            fold_dict[model_name][fold].update({'ROC_AUC'   : roc_auc})
             fold_no +=1
     #pp.pprint(skf_dict)
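A side note on the metric toggles in the hunks above (an observation about sklearn behaviour, not part of the diff): with zero_division dropped, precision_score and recall_score emit UndefinedMetricWarning on any fold where a model predicts no positives, and the re-enabled per-fold roc_auc_score raises ValueError if a fold's y_true contains a single class; StratifiedKFold keeps class ratios per fold, which makes that unlikely but not impossible for rare classes.

from sklearn.metrics import precision_score, roc_auc_score

y_true = [0, 0, 1, 1]
y_pred = [0, 0, 0, 0]                                    # no positives predicted
print(precision_score(y_true, y_pred, zero_division=0))  # 0.0, warning silenced
try:
    roc_auc_score([0, 0, 0, 0], [0.1, 0.2, 0.3, 0.4])    # single-class y_true
except ValueError as e:
    print('roc_auc_score failed:', e)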
diff --git a/loopity_loop_CALL.py b/loopity_loop_CALL.py
index 5f8833a..00e33b1 100644
--- a/loopity_loop_CALL.py
+++ b/loopity_loop_CALL.py
@@ -7,55 +7,32 @@ Created on Fri Mar 11 11:15:50 2022
 """
 #%%
 del(t3_res)
-t3_res = MultClassPipeSKF(input_df = numerical_features_df
-                          , y_targetF = target1
+# t3_res = MultClassPipeSKF(input_df = numerical_features_df
+#                           , y_targetF = target1
+#                           , var_type = 'numerical'
+#                           , skf_splits = 10)
+# pp.pprint(t3_res)
+# #print(t3_res)
+
+t3_res = MultClassPipeSKF(input_df = num_df_wtgt[numerical_FN]
+                          , y_targetF = num_df_wtgt['mutation_class']
                           , var_type = 'numerical'
                           , skf_splits = 10)
 pp.pprint(t3_res)
 #print(t3_res)
-#%% Manually: mean for each model, each metric
-model_name = 'Logistic Regression'
-model_name = 'Naive Bayes'
-model_name = 'K-Nearest Neighbors'
-model_name = 'SVM'
-#%%
-model_metric = 'F1_score'
-
-log_reg_f1 = []
-for key in t3_res[model_name]:
-    log_reg_f1.append(t3_res[model_name][key][model_metric])
-    log_reg_f1M = mean(log_reg_f1)
-    print('key:', key, model_metric, ':', log_reg_f1)
-print(log_reg_f1M)
-
-log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
-log_reg_f1df
-
-#%%
-model_metric = 'MCC'
-log_reg_mcc = []
-for key in t3_res[model_name]:
-    log_reg_mcc.append(t3_res[model_name][key][model_metric])
-    log_reg_mccM = mean(log_reg_mcc)
-    print('key:', key, model_metric, ':', log_reg_mcc)
-print(log_reg_mccM)
-
-log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
-log_reg_mccdf
-#%% ################################################################
 # extract items from within a nested dict
 #%% Classification Metrics we need to mean()
-classification_metrics = {
-    'F1_score': []
-    ,'MCC': []
-    ,'Precision': []
-    ,'Recall': []
-    ,'Accuracy': []
-    }
+# classification_metrics = {
+#     'F1_score': []
+#     ,'MCC': []
+#     ,'Precision': []
+#     ,'Recall': []
+#     ,'Accuracy': []
+#     ,'ROC_AUC':[]
+#     }
 # "mean() of the current metric across all folds for this model"
-
 # the output containing all the metrics across all folds for this model
 out={}
 # Just the mean() for each of the above metrics-per-model
@@ -64,16 +41,16 @@ out_means={}
 # Build up out{} from t3_res, which came from loopity_loop
 for model in t3_res:
     # NOTE: can't copy objects in Python!!!
-    out[model]={'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': []}
+    out[model]={'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': [], 'ROC_AUC':[]}
     out_means[model]={} # just to make life easier
     print(model)
     for fold in t3_res[model]:
-        for metric in {'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': []}:
+        for metric in {'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': [], 'ROC_AUC':[]}:
            metric_value = t3_res[model][fold][metric]
            out[model][metric].append(metric_value)
 # now that we've built out{}, let's mean() each metric
 for model in out:
-    for metric in {'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': []}:
+    for metric in {'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': [], 'ROC_AUC':[]}:
        metric_mean = mean(out[model][metric])
        # just some debug output
        # print('model:', model
@@ -84,3 +61,4 @@ for model in out:
        out_means[model].update({(metric+'_mean'): metric_mean })
 out_scores = pd.DataFrame(out_means)
+out_scores2 = round(out_scores, 2)
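For comparison, the fold-averaging done by out{} and out_means{} above can also be written as a pandas groupby; a sketch assuming the {model: {fold: {metric: value}}} shape of t3_res (the values below are toy numbers, not real results):

import pandas as pd

t3_res_toy = {'Logistic Regression': {'fold_1': {'F1_score': 0.80, 'MCC': 0.60}
                                      , 'fold_2': {'F1_score': 0.90, 'MCC': 0.70}}
              , 'SVM': {'fold_1': {'F1_score': 0.70, 'MCC': 0.40}
                        , 'fold_2': {'F1_score': 0.75, 'MCC': 0.45}}}
# one row per (model, fold), one column per metric
records = []
for model, folds in t3_res_toy.items():
    for fold, metrics in folds.items():
        records.append({'Model': model, **metrics})
long_df = pd.DataFrame(records)
# mean over folds, models as columns: same layout as out_scores2
out_scores_pd = round(long_df.groupby('Model').mean().T, 2)
out_scores_pd.index = [m + '_mean' for m in out_scores_pd.index]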