diff --git a/uq_ml_models_FS/UQ_yc_runallcls.py b/uq_ml_models_FS/UQ_yc_runallcls.py deleted file mode 100644 index 69d01c2..0000000 --- a/uq_ml_models_FS/UQ_yc_runallcls.py +++ /dev/null @@ -1,350 +0,0 @@ -import pandas as pd -import numpy as np -import scipy as sp -import time -import sys -import os -import re -import argparse -from math import sqrt -from scipy import stats -import joblib -# Alogorithm -from xgboost.sklearn import XGBClassifier -from sklearn import svm -from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier -from sklearn.gaussian_process import GaussianProcessClassifier -from sklearn.ensemble import AdaBoostClassifier -from sklearn.neighbors import KNeighborsClassifier -from sklearn.neural_network import MLPRegressor -from sklearn.utils import all_estimators -# Pre-processing -from sklearn import preprocessing -from sklearn.preprocessing import StandardScaler -from sklearn.datasets import make_classification -from sklearn.pipeline import Pipeline, make_pipeline -from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict -# Metric -from sklearn.metrics import mean_squared_error, make_scorer, roc_auc_score, f1_score, matthews_corrcoef, accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report -############################################################################### -# TT imports -from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score -from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report -from sklearn.model_selection import train_test_split, cross_validate, cross_val_score -from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold -from copy import deepcopy -from sklearn import linear_model -from 
sklearn import datasets -from collections import Counter - -from sklearn.linear_model import LogisticRegression, LogisticRegressionCV -from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier - -from sklearn.naive_bayes import BernoulliNB -from sklearn.neighbors import KNeighborsClassifier -from sklearn.svm import SVC -from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier -from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier -from sklearn.naive_bayes import GaussianNB -from sklearn.gaussian_process import GaussianProcessClassifier, kernels -from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel - -from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis -from sklearn.neural_network import MLPClassifier - -from sklearn.svm import SVC -from xgboost import XGBClassifier -from sklearn.naive_bayes import MultinomialNB -from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder - -from sklearn.compose import ColumnTransformer -from sklearn.compose import make_column_transformer - -from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score -from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report - -from sklearn.model_selection import train_test_split, cross_validate, cross_val_score -from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold - -from sklearn.pipeline import Pipeline, make_pipeline - -from sklearn.feature_selection import RFE, RFECV - -import itertools -import seaborn as sns -import matplotlib.pyplot as plt - -from statistics import mean, stdev, median, mode - -from imblearn.over_sampling import 
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours

from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from sklearn.impute import KNNImputer as KNN
import json
##############################################################################

# Shared keyword bundles, splatted into sklearn calls below.
rs = {'random_state': 42}
njobs = {'n_jobs': 10}

# Scorer dictionary in the shape expected by cross_validate(scoring=...).
# Not used by run_all_ML itself, which scores cross_val_predict output.
scoring_fn = ({'accuracy'  : make_scorer(accuracy_score)
               , 'fscore'    : make_scorer(f1_score)
               , 'mcc'       : make_scorer(matthews_corrcoef)
               , 'precision' : make_scorer(precision_score)
               , 'recall'    : make_scorer(recall_score)
               , 'roc_auc'   : make_scorer(roc_auc_score)
               , 'jcc'       : make_scorer(jaccard_score)
               })

# Pre-built CV splitters (defined here for reuse; run_all_ML uses cv = 10).
skf_cv = StratifiedKFold(n_splits = 10
                         , shuffle = True, **rs)

rskf_cv = RepeatedStratifiedKFold(n_splits = 10
                                  , n_repeats = 3
                                  , **rs)

mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}

# Column order shared by both result tables returned from run_all_ML.
_RESULT_COLUMNS = ['estimator', 'matthew', 'roc_auc', 'bacc', 'f1',
                   'TP', 'TN', 'FP', 'FN']


def _build_column_transformer(input_pd, var_type):
    """Return the ColumnTransformer matching *var_type* for *input_pd*."""
    numerical_ix = input_pd.select_dtypes(include=['int64', 'float64']).columns
    categorical_ix = input_pd.select_dtypes(include=['object', 'bool']).columns

    transformers_by_type = {
        'numerical': [('num', MinMaxScaler(), numerical_ix)],
        'categorical': [('cat', OneHotEncoder(), categorical_ix)],
        'mixed': [('num', MinMaxScaler(), numerical_ix),
                  ('cat', OneHotEncoder(), categorical_ix)],
    }
    if var_type not in transformers_by_type:
        raise ValueError(
            "var_type must be one of {}; got {!r}".format(
                sorted(transformers_by_type), var_type))

    return ColumnTransformer(transformers = transformers_by_type[var_type]
                             , remainder = 'passthrough')


def _score_predictions(name, y_true, y_pred):
    """Return one result row (dict keyed by _RESULT_COLUMNS) for *name*."""
    # sklearn metrics take (y_true, y_pred); with that order ravel() yields
    # tn, fp, fn, tp.  The 4-way unpack assumes a binary target.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return {
        'estimator': name,
        'matthew': round(matthews_corrcoef(y_true, y_pred), 3),
        # NOTE: ROC AUC here is computed from hard class predictions,
        # not from probabilities or decision scores.
        'roc_auc': round(roc_auc_score(y_true, y_pred), 3),
        'bacc': round(balanced_accuracy_score(y_true, y_pred), 3),
        'f1': round(f1_score(y_true, y_pred), 3),
        'TP': tp,
        'TN': tn,
        'FP': fp,
        'FN': fn,
    }


#%% YC
def run_all_ML(input_pd, target_label, blind_test_input_df, blind_test_target, preprocess = True, var_type = 'numerical'):
    """Benchmark every scikit-learn classifier with default settings.

    For each class returned by ``all_estimators(type_filter="classifier")``
    the function (1) scores 10-fold ``cross_val_predict`` output on the
    training data and (2) refits on all training data and scores the
    predictions on the blind test set.  Estimators that cannot be built or
    fitted are reported on stdout and skipped.

    Parameters
    ----------
    input_pd : pandas.DataFrame
        Training features.
    target_label : array-like
        Training target aligned with ``input_pd``.
    blind_test_input_df : pandas.DataFrame
        Blind-test features with the same columns as ``input_pd``.
    blind_test_target : array-like
        Blind-test target.
    preprocess : bool, default True
        When true, a ColumnTransformer step precedes the model.
    var_type : {'numerical', 'categorical', 'mixed'}, default 'numerical'
        Selects min-max scaling of int64/float64 columns, one-hot encoding
        of object/bool columns, or both.  Other columns pass through.

    Returns
    -------
    dict
        ``{'CrossValResultsDF': DataFrame, 'BlindTestResultsDF': DataFrame}``
        with columns estimator, matthew, roc_auc, bacc, f1, TP, TN, FP, FN.
        Both keys are always present; a table is empty when no estimator
        succeeded.  Score columns are numeric, so sorting by them is
        numeric rather than lexicographic.

    Raises
    ------
    ValueError
        If ``var_type`` is not one of the three supported values.
    """
    X = input_pd
    y = target_label

    # Built (and var_type validated) regardless of `preprocess`.
    col_transform = _build_column_transformer(input_pd, var_type)

    # Rows are gathered in lists and turned into frames once at the end:
    # DataFrame.append no longer exists in pandas >= 2.0, and stacking the
    # name with the scores via numpy would coerce every score to a string.
    cv_rows = []
    bts_rows = []

    for name, algorithm in all_estimators(type_filter="classifier"):
        try:
            steps = [('prep', col_transform)] if preprocess else []
            # Some classifiers need constructor arguments; those raise here
            # and are skipped by the handler below.
            steps.append(("model", algorithm()))
            pipe = Pipeline(steps)

            # Cross-validated predictions on the training data.
            y_pred = cross_val_predict(pipe, X, y, cv = 10, **njobs)
            cv_rows.append(_score_predictions(name, y, y_pred))

            #=========================
            # Blind test: BTS results
            #=========================
            pipe.fit(X, y)
            bts_predict = pipe.predict(blind_test_input_df)

            bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
            print('\nMCC on Blind test:' , bts_mcc_score)

            bts_rows.append(_score_predictions(name, blind_test_target, bts_predict))

        except Exception as e:
            # Deliberate best-effort: one failing estimator must not abort
            # the whole benchmark run.
            print("XXXGot an error while running {}".format(name))
            print(e)

    return {
        'CrossValResultsDF': pd.DataFrame(cv_rows, columns = _RESULT_COLUMNS),
        'BlindTestResultsDF': pd.DataFrame(bts_rows, columns = _RESULT_COLUMNS),
    }


#%% CALL function
#run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
# Baseline_data

YC_resD2 = run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
CVResultsDF_baseline = YC_resD2['CrossValResultsDF']
CVResultsDF_baseline.sort_values(by=['matthew'], ascending=False, inplace=True)
BTSResultsDF_baseline = YC_resD2['BlindTestResultsDF']
BTSResultsDF_baseline.sort_values(by=['matthew'], ascending=False, inplace=True)


def _ranked_frames(results):
    """Sort both result tables of one run_all_ML output by MCC, best first.

    The frames are sorted in place (so the entries held in *results* are
    sorted too) and returned as ``(cross_val_frame, blind_test_frame)``.
    """
    cv_frame = results['CrossValResultsDF']
    cv_frame.sort_values(by=['matthew'], ascending=False, inplace=True)
    bts_frame = results['BlindTestResultsDF']
    bts_frame.sort_values(by=['matthew'], ascending=False, inplace=True)
    return cv_frame, bts_frame


#============================================
# Resampled training sets, same blind test set
#============================================
# Random oversampling
YC_resD_ros = run_all_ML(
    input_pd=X_ros, target_label=y_ros,
    blind_test_input_df=X_bts, blind_test_target=y_bts,
    preprocess=True, var_type='mixed')
CVResultsDF_ros, BTSResultsDF_ros = _ranked_frames(YC_resD_ros)

# Random undersampling
YC_resD_rus = run_all_ML(
    input_pd=X_rus, target_label=y_rus,
    blind_test_input_df=X_bts, blind_test_target=y_bts,
    preprocess=True, var_type='mixed')
CVResultsDF_rus, BTSResultsDF_rus = _ranked_frames(YC_resD_rus)

# Random oversampling combined with undersampling
YC_resD_rouC = run_all_ML(
    input_pd=X_rouC, target_label=y_rouC,
    blind_test_input_df=X_bts, blind_test_target=y_bts,
    preprocess=True, var_type='mixed')
CVResultsDF_rouC, BTSResultsDF_rouC = _ranked_frames(YC_resD_rouC)

# SMOTE-NC
YC_resD_smnc = run_all_ML(
    input_pd=X_smnc, target_label=y_smnc,
    blind_test_input_df=X_bts, blind_test_target=y_bts,
    preprocess=True, var_type='mixed')
CVResultsDF_smnc, BTSResultsDF_smnc = _ranked_frames(YC_resD_smnc)

##############################################################################
#============================================
# Baseline models on individual feature groups
#============================================
# Genomic features
yC_gf = run_all_ML(
    input_pd=X[X_genomicFN], target_label=y,
    blind_test_input_df=X_bts[X_genomicFN], blind_test_target=y_bts,
    preprocess=True, var_type='mixed')
yc_gfCT_baseline, yc_gfBT_baseline = _ranked_frames(yC_gf)

# Evolutionary features
yC_ev = run_all_ML(
    input_pd=X[X_evolFN], target_label=y,
    blind_test_input_df=X_bts[X_evolFN], blind_test_target=y_bts,
    preprocess=True, var_type='mixed')
yc_evCT_baseline, yc_evBT_baseline = _ranked_frames(yC_ev)

# Structural features: all
yC_sfall = run_all_ML(
    input_pd=X[X_strFN], target_label=y,
    blind_test_input_df=X_bts[X_strFN], blind_test_target=y_bts,
    preprocess=True, var_type='mixed')
yc_sfallCT_baseline, yc_sfallBT_baseline = _ranked_frames(yC_sfall)

# Structural features: common stability columns only
yC_sfco = run_all_ML(
    input_pd=X[common_cols_stabiltyN], target_label=y,
    blind_test_input_df=X_bts[common_cols_stabiltyN], blind_test_target=y_bts,
    preprocess=True, var_type='mixed')
yc_sfcoCT_baseline, yc_sfcoBT_baseline = _ranked_frames(yC_sfco)

# Structural features: common stability columns plus the foldX columns
yC_fxss = run_all_ML(
    input_pd=X[common_cols_stabiltyN+foldX_cols], target_label=y,
    blind_test_input_df=X_bts[common_cols_stabiltyN+foldX_cols], blind_test_target=y_bts,
    preprocess=True, var_type='mixed')
yc_fxssCT_baseline, yc_fxssBT_baseline = _ranked_frames(yC_fxss)

# Categorical features
yC_cat = run_all_ML(
    input_pd=X[categorical_FN], target_label=y,
    blind_test_input_df=X_bts[categorical_FN], blind_test_target=y_bts,
    preprocess=True, var_type='mixed')
yc_catCT_baseline, yc_catBT_baseline = _ranked_frames(yC_cat)