diff --git a/UQ_yc_RunAllClfs.py b/UQ_yc_RunAllClfs.py
new file mode 100644
index 0000000..69d01c2
--- /dev/null
+++ b/UQ_yc_RunAllClfs.py
@@ -0,0 +1,350 @@
+import pandas as pd
+import numpy as np
+import scipy as sp
+import time
+import sys
+import os
+import re
+import argparse
+from math import sqrt
+from scipy import stats
+import joblib
+# Algorithms
+from xgboost.sklearn import XGBClassifier
+from sklearn import svm
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
+from sklearn.gaussian_process import GaussianProcessClassifier
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.neural_network import MLPRegressor
+from sklearn.utils import all_estimators
+# Pre-processing
+from sklearn import preprocessing
+from sklearn.preprocessing import StandardScaler
+from sklearn.datasets import make_classification
+from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
+# Metrics
+from sklearn.metrics import mean_squared_error, make_scorer, roc_auc_score, f1_score, matthews_corrcoef, accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report
+###############################################################################
+# TT imports
+from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
+from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, RepeatedKFold
+from copy import deepcopy
+from sklearn import linear_model
+from sklearn import datasets
+from collections import Counter
+
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
+from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier
+
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.gaussian_process import GaussianProcessClassifier, kernels
+from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel
+
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
+from sklearn.neural_network import MLPClassifier
+
+from sklearn.svm import SVC
+from xgboost import XGBClassifier
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
+
+from sklearn.compose import ColumnTransformer
+from sklearn.compose import make_column_transformer
+
+from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
+
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
+from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, RepeatedKFold
+
+from sklearn.pipeline import Pipeline, make_pipeline
+
+from sklearn.feature_selection import RFE, RFECV
+
+import itertools
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+from statistics import mean, stdev, median, mode
+
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.under_sampling import RandomUnderSampler
+from imblearn.over_sampling import SMOTE
+from sklearn.datasets import make_classification
+from imblearn.combine import SMOTEENN
+from imblearn.combine import SMOTETomek
+
+from imblearn.over_sampling import SMOTENC
+from imblearn.under_sampling import EditedNearestNeighbours
+from imblearn.under_sampling import RepeatedEditedNearestNeighbours
+
+from sklearn.model_selection import GridSearchCV
+from sklearn.base import BaseEstimator
+from sklearn.impute import KNNImputer as KNN
+import json
+##############################################################################
+
+# Other vars
+rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
+
+scoring_fn = ({'accuracy'  : make_scorer(accuracy_score)
+               , 'fscore'    : make_scorer(f1_score)
+               , 'mcc'       : make_scorer(matthews_corrcoef)
+               , 'precision' : make_scorer(precision_score)
+               , 'recall'    : make_scorer(recall_score)
+               , 'roc_auc'   : make_scorer(roc_auc_score)
+               , 'jcc'       : make_scorer(jaccard_score)
+               })
+
+skf_cv = StratifiedKFold(n_splits = 10
+                         #, shuffle = False, random_state = None
+                         , shuffle = True, **rs)
+
+rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+                                  , n_repeats = 3
+                                  , **rs)
+
+mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
+jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
+#%% YC
+#def run_all_ML(input_pd, target_label, bts_input, bts_target, var_type):
+def run_all_ML(input_pd, target_label, blind_test_input_df, blind_test_target, preprocess = True, var_type = 'numerical'):
+    """Fit every sklearn classifier on (input_pd, target_label) with 10-fold CV,
+    evaluate each fitted pipeline on the fixed blind test set, and return both
+    result DataFrames in a dict."""
+    #y = input_pd[target_label]
+    #X = input_pd.drop(target_label, axis=1)
+    y = target_label
+    X = input_pd
+
+    # Determine categorical and numerical features
+    numerical_ix = input_pd.select_dtypes(include=['int64', 'float64']).columns
+    categorical_ix = input_pd.select_dtypes(include=['object', 'bool']).columns
+
+    # Determine preprocessing steps ~ var_type
+    if var_type == 'numerical':
+        t = [('num', MinMaxScaler(), numerical_ix)]
+
+    if var_type == 'categorical':
+        t = [('cat', OneHotEncoder(), categorical_ix)]
+
+    if var_type == 'mixed':
+        t = [('num', MinMaxScaler(), numerical_ix)
+             , ('cat', OneHotEncoder(), categorical_ix)]
+
+    col_transform = ColumnTransformer(transformers = t
+                                      , remainder = 'passthrough')
+    result_pd = pd.DataFrame()
+    result_bts_pd = pd.DataFrame()
+    #results_btsD = {}
+    results_all = {}
+
+    for name, algorithm in all_estimators(type_filter="classifier"):
+        try:
+            estimator = algorithm()
+            temp_pd = pd.DataFrame()
+            temp_cm = pd.DataFrame()
+
+            # # orig
+            # pipe = Pipeline([
+            #     ("model", algorithm())
+            # ])
+
+            # Turn preprocessing on and off
+            if preprocess:
+                pipe = Pipeline([
+                    ('prep'  , col_transform),
+                    ("model" , algorithm())
+                ])
+            else:
+                pipe = Pipeline([
+                    ("model" , algorithm())
+                ])
+
+            # Cross-validated predictions
+            y_pred = cross_val_predict(pipe, X, y, cv = 10, **njobs)
+            # TODO: switch to cross_validate with scoring_fn so each metric is
+            # computed per fold rather than on pooled out-of-fold predictions:
+            # y_pred = cross_validate(pipe, X, y
+            #                         , cv = 10
+            #                         , scoring = scoring_fn
+            #                         , **njobs)
+
+            # sklearn metrics expect (y_true, y_pred)
+            _mcc  = round(matthews_corrcoef(y, y_pred), 3)
+            _bacc = round(balanced_accuracy_score(y, y_pred), 3)
+            _f1   = round(f1_score(y, y_pred), 3)
+            _roc_auc = round(roc_auc_score(y, y_pred), 3)
+            _tn, _fp, _fn, _tp = confusion_matrix(y, y_pred).ravel()
+
+            # result_pd = result_pd.append(pd.DataFrame(np.column_stack([name
+            #                                            , _tp, _tn
+            #                                            , _fp, _fn
+            #                                            , _roc_auc
+            #                                            , _mcc
+            #                                            , _bacc, _f1]),
+            #                              columns=['estimator', 'TP', 'TN', 'FP', 'FN',
+            #                                       'roc_auc', 'matthew', 'bacc', 'f1']),
+            #                              ignore_index=True)
+
+            # DataFrame.append() was removed in pandas >= 2.0; use pd.concat() instead
+            result_pd = pd.concat(
+                [result_pd,
+                 pd.DataFrame(np.column_stack([name
+                                               , _mcc
+                                               , _roc_auc
+                                               , _bacc, _f1
+                                               , _tp, _tn
+                                               , _fp, _fn]),
+                              columns=['estimator', 'matthew', 'roc_auc', 'bacc', 'f1',
+                                       'TP', 'TN', 'FP', 'FN'])],
+                ignore_index=True)
+            #=========================
+            # Blind test: BTS results
+            #=========================
+            # Build the final results with all scores for a feature-selected model
+            pipe.fit(input_pd, target_label)
+            bts_predict = pipe.predict(blind_test_input_df)
+
+            bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict), 2)
+            print('\nMCC on Blind test:', bts_mcc_score)
+            #print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict), 2))
+
+            _mccBTS     = round(matthews_corrcoef(blind_test_target, bts_predict), 3)
+            _baccBTS    = round(balanced_accuracy_score(blind_test_target, bts_predict), 3)
+            _f1BTS      = round(f1_score(blind_test_target, bts_predict), 3)
+            _roc_aucBTS = round(roc_auc_score(blind_test_target, bts_predict), 3)
+            _tnBTS, _fpBTS, _fnBTS, _tpBTS = confusion_matrix(blind_test_target, bts_predict).ravel()
+
+            # result_bts_pd = result_bts_pd.append(pd.DataFrame(np.column_stack([name
+            #                                                    , _tpBTS, _tnBTS
+            #                                                    , _fpBTS, _fnBTS
+            #                                                    , _roc_aucBTS
+            #                                                    , _mccBTS
+            #                                                    , _baccBTS, _f1BTS]),
+            #                                      columns=['estimator', 'TP', 'TN', 'FP', 'FN',
+            #                                               'roc_auc', 'matthew', 'bacc', 'f1']),
+            #                                      ignore_index=True)
+            result_bts_pd = pd.concat(
+                [result_bts_pd,
+                 pd.DataFrame(np.column_stack([name
+                                               , _mccBTS
+                                               , _roc_aucBTS
+                                               , _baccBTS, _f1BTS
+                                               , _tpBTS, _tnBTS
+                                               , _fpBTS, _fnBTS]),
+                              columns=['estimator', 'matthew', 'roc_auc', 'bacc', 'f1',
+                                       'TP', 'TN', 'FP', 'FN'])],
+                ignore_index=True)
+
+            results_all['CrossValResultsDF'] = result_pd
+            results_all['BlindTestResultsDF'] = result_bts_pd
+
+        except Exception as e:
+            print("XXX Got an error while running {}".format(name))
+            print(e)
+
+    #return(result_pd)
+    return(results_all)
+
+
+#%% CALL function
+#run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+# Baseline data
+YC_resD2 = run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+CVResultsDF_baseline = YC_resD2['CrossValResultsDF']
+CVResultsDF_baseline.sort_values(by=['matthew'], ascending=False, inplace=True)
+BTSResultsDF_baseline = YC_resD2['BlindTestResultsDF']
+BTSResultsDF_baseline.sort_values(by=['matthew'], ascending=False, inplace=True)
+
+# from sklearn.utils import all_estimators
+# for name, algorithm in all_estimators(type_filter="classifier"):
+#     clf = algorithm()
+#     print('Name:', name, '\nAlgo:', clf)
+
+# Random Oversampling
+YC_resD_ros = run_all_ML(input_pd=X_ros, target_label=y_ros, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+CVResultsDF_ros = YC_resD_ros['CrossValResultsDF']
+CVResultsDF_ros.sort_values(by=['matthew'], ascending=False, inplace=True)
+BTSResultsDF_ros = YC_resD_ros['BlindTestResultsDF']
+BTSResultsDF_ros.sort_values(by=['matthew'], ascending=False, inplace=True)
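+
+# Each resampling variant below re-runs the same classifier sweep: only the
+# training data changes, while the blind test set (X_bts, y_bts) stays fixed.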
+
+# Random Undersampling
+YC_resD_rus = run_all_ML(input_pd=X_rus, target_label=y_rus, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+CVResultsDF_rus = YC_resD_rus['CrossValResultsDF']
+CVResultsDF_rus.sort_values(by=['matthew'], ascending=False, inplace=True)
+BTSResultsDF_rus = YC_resD_rus['BlindTestResultsDF']
+BTSResultsDF_rus.sort_values(by=['matthew'], ascending=False, inplace=True)
+
+# Random Oversampling + Undersampling
+YC_resD_rouC = run_all_ML(input_pd=X_rouC, target_label=y_rouC, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+CVResultsDF_rouC = YC_resD_rouC['CrossValResultsDF']
+CVResultsDF_rouC.sort_values(by=['matthew'], ascending=False, inplace=True)
+BTSResultsDF_rouC = YC_resD_rouC['BlindTestResultsDF']
+BTSResultsDF_rouC.sort_values(by=['matthew'], ascending=False, inplace=True)
+
+# SMOTE NC
+YC_resD_smnc = run_all_ML(input_pd=X_smnc, target_label=y_smnc, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+CVResultsDF_smnc = YC_resD_smnc['CrossValResultsDF']
+CVResultsDF_smnc.sort_values(by=['matthew'], ascending=False, inplace=True)
+BTSResultsDF_smnc = YC_resD_smnc['BlindTestResultsDF']
+BTSResultsDF_smnc.sort_values(by=['matthew'], ascending=False, inplace=True)
+##############################################################################
+#============================================
+# BASELINE models with dissected features
+#============================================
+# Genomics
+yC_gf = run_all_ML(input_pd=X[X_genomicFN], target_label=y, blind_test_input_df=X_bts[X_genomicFN], blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+yc_gfCT_baseline = yC_gf['CrossValResultsDF']
+yc_gfCT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True)
+yc_gfBT_baseline = yC_gf['BlindTestResultsDF']
+yc_gfBT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True)
+
+# Evolutionary
+yC_ev = run_all_ML(input_pd=X[X_evolFN], target_label=y, blind_test_input_df=X_bts[X_evolFN], blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+yc_evCT_baseline = yC_ev['CrossValResultsDF']
+yc_evCT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True)
+yc_evBT_baseline = yC_ev['BlindTestResultsDF']
+yc_evBT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True)
+
+# strucF: All
+yC_sfall = run_all_ML(input_pd=X[X_strFN], target_label=y, blind_test_input_df=X_bts[X_strFN], blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+yc_sfallCT_baseline = yC_sfall['CrossValResultsDF']
+yc_sfallCT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True)
+yc_sfallBT_baseline = yC_sfall['BlindTestResultsDF']
+yc_sfallBT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True)
+
+# strucF: Common ONLY
+yC_sfco = run_all_ML(input_pd=X[common_cols_stabiltyN], target_label=y
+                     , blind_test_input_df=X_bts[common_cols_stabiltyN]
+                     , blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+yc_sfcoCT_baseline = yC_sfco['CrossValResultsDF']
+yc_sfcoCT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True)
+yc_sfcoBT_baseline = yC_sfco['BlindTestResultsDF']
+yc_sfcoBT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True)
+
+# strucF: common_stability + foldX_cols, i.e. interaction
+yC_fxss = run_all_ML(input_pd=X[common_cols_stabiltyN+foldX_cols], target_label=y
+                     , blind_test_input_df=X_bts[common_cols_stabiltyN+foldX_cols]
+                     , blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+yc_fxssCT_baseline = yC_fxss['CrossValResultsDF']
+yc_fxssCT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True)
+yc_fxssBT_baseline = yC_fxss['BlindTestResultsDF']
+yc_fxssBT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True)
+
+# Categorical
+yC_cat = run_all_ML(input_pd=X[categorical_FN], target_label=y
+                    , blind_test_input_df=X_bts[categorical_FN]
+                    , blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
+yc_catCT_baseline = yC_cat['CrossValResultsDF']
+yc_catCT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True)
+yc_catBT_baseline = yC_cat['BlindTestResultsDF']
+yc_catBT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True)