diff --git a/pnca_config.py b/pnca_config.py index fe0b35a..1d7a113 100755 --- a/pnca_config.py +++ b/pnca_config.py @@ -10,7 +10,7 @@ import os gene = 'pncA' drug = 'pyrazinamide' -total_mtblineage_u = 8 +#total_mtblineage_u = 8 homedir = os.path.expanduser("~") @@ -22,5 +22,29 @@ from UQ_ML_data import * # from YC run_all_ML: run locally from UQ_MultModelsCl import MultModelsCl +print('TESTING cmd:' + , '\nGene name:', gene + , '\nDrug name:', drug + , '\nTotal input features:', X.shape + , '\n', Counter(y)) -print('TESTING cmd:', Counter(y)) \ No newline at end of file +print('Strucutral features (n):' + , len(common_cols_stabiltyN) + len(foldX_cols) + len(X_strFN) + , '\nThese are:' + , '\nCommon stablity features:', common_cols_stabiltyN + , '\nFoldX columns:', foldX_cols + , '\nOther struc columns:', X_strFN + , '\n================================================================\n') + +print('Evolutionary features (n):' + , len(X_evolFN) + , '\nThese are:\n' + , X_evolFN + , '\n================================================================\n') + +print('Genomic features (n):' + , len(X_genomicFN) + , '\nThese are:\n' + , X_genomic_mafor, '\n' + , X_genomic_linegae + , '\n================================================================\n') diff --git a/uq_ml_models_FS/scriptfsycm.py b/uq_ml_models_FS/scriptfsycm.py index 7934fcb..1e2e578 100644 --- a/uq_ml_models_FS/scriptfsycm.py +++ b/uq_ml_models_FS/scriptfsycm.py @@ -26,6 +26,72 @@ from sklearn.pipeline import Pipeline, make_pipeline from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict # Metric from sklearn.metrics import mean_squared_error, make_scorer, roc_auc_score, f1_score, matthews_corrcoef, accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report +############################################################################### +# TT imports +from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score +from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report +from sklearn.model_selection import train_test_split, cross_validate, cross_val_score +from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold +from copy import deepcopy +from sklearn import linear_model +from sklearn import datasets +from collections import Counter + +from sklearn.linear_model import LogisticRegression, LogisticRegressionCV +from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier + +from sklearn.naive_bayes import BernoulliNB +from sklearn.neighbors import KNeighborsClassifier +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier +from sklearn.naive_bayes import GaussianNB +from sklearn.gaussian_process import GaussianProcessClassifier, kernels +from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel + +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis +from sklearn.neural_network import MLPClassifier + +from sklearn.svm import SVC +from xgboost import XGBClassifier +from sklearn.naive_bayes import MultinomialNB +from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder + +from sklearn.compose import ColumnTransformer +from sklearn.compose import make_column_transformer + +from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score +from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report + +from sklearn.model_selection import train_test_split, cross_validate, cross_val_score +from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold + +from sklearn.pipeline import Pipeline, make_pipeline + +from sklearn.feature_selection import RFE, RFECV + +import itertools +import seaborn as sns +import matplotlib.pyplot as plt + +from statistics import mean, stdev, median, mode + +from imblearn.over_sampling import RandomOverSampler +from imblearn.under_sampling import RandomUnderSampler +from imblearn.over_sampling import SMOTE +from sklearn.datasets import make_classification +from imblearn.combine import SMOTEENN +from imblearn.combine import SMOTETomek + +from imblearn.over_sampling import SMOTENC +from imblearn.under_sampling import EditedNearestNeighbours +from imblearn.under_sampling import RepeatedEditedNearestNeighbours + +from sklearn.model_selection import GridSearchCV +from sklearn.base import BaseEstimator +from sklearn.impute import KNNImputer as KNN +import json +############################################################################## # other vars rs = {'random_state': 42} @@ -119,10 +185,25 @@ def run_all_ML(input_pd, target_label, blind_test_input_df, blind_test_target, p _roc_auc = round(roc_auc_score(y_pred, y), 3) _tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() - result_pd = result_pd.append(pd.DataFrame(np.column_stack([name, _tp, _tn, _fp, _fn, _roc_auc, _mcc, _bacc, _f1]),\ - columns=['estimator', 'TP', 'TN', 'FP', 'FN', - 'roc_auc', 'matthew', 'bacc', 'f1']),\ - ignore_index=True) + # result_pd = result_pd.append(pd.DataFrame(np.column_stack([name + # , _tp, _tn + # , _fp , _fn + # , _roc_auc + # , _mcc + # , _bacc, _f1]),\ + # columns=['estimator', 'TP', 'TN', 'FP', 'FN', + # 'roc_auc', 'matthew', 'bacc', 'f1']),\ + # ignore_index=True) + + result_pd = result_pd.append(pd.DataFrame(np.column_stack([name + , _mcc + , _roc_auc + , _bacc, _f1 + , _tp, _tn + , _fp , _fn]),\ + columns=['estimator', 'matthew', 'roc_auc', 'bacc', 'f1',\ + 'TP', 'TN', 'FP', 'FN']),\ + ignore_index=True) #========================= # Blind test: BTS results #========================= @@ -140,15 +221,24 @@ def run_all_ML(input_pd, target_label, blind_test_input_df, blind_test_target, p _roc_aucBTS = round(roc_auc_score(bts_predict, blind_test_target), 3) _tnBTS, _fpBTS, _fnBTS, _tpBTS = confusion_matrix(bts_predict, blind_test_target).ravel() + # result_bts_pd = result_bts_pd.append(pd.DataFrame(np.column_stack([name + # , _tpBTS, _tnBTS + # , _fpBTS, _fnBTS + # , _roc_aucBTS + # , _mccBTS + # , _baccBTS, _f1BTS]),\ + # columns=['estimator', 'TP', 'TN', 'FP', 'FN', + # 'roc_auc', 'matthew', 'bacc', 'f1']),\ + # ignore_index=True) result_bts_pd = result_bts_pd.append(pd.DataFrame(np.column_stack([name - , _tpBTS, _tnBTS - , _fpBTS, _fnBTS - , _roc_aucBTS , _mccBTS - , _baccBTS, _f1BTS]),\ - columns=['estimator', 'TP', 'TN', 'FP', 'FN', - 'roc_auc', 'matthew', 'bacc', 'f1']),\ - ignore_index=True) + , _roc_aucBTS + , _baccBTS, _f1BTS + , _tpBTS, _tnBTS + , _fpBTS, _fnBTS]),\ + columns=['estimator','matthew', 'roc_auc', 'bacc', 'f1',\ + 'TP', 'TN', 'FP', 'FN']),\ + ignore_index=True) results_all['CrossValResultsDF'] = result_pd @@ -165,15 +255,13 @@ def run_all_ML(input_pd, target_label, blind_test_input_df, blind_test_target, p #%% CALL function #run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed') +# Baseline_data YC_resD2 = run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed') - -YC_resD_ros = run_all_ML(input_pd=X_ros, target_label=y_ros, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed') - -CVResultsDF = YC_resD2['CrossValResultsDF'] -CVResultsDF.sort_values(by=['matthew'], ascending=False, inplace=True) -BTSResultsDF = YC_resD2['BlindTestResultsDF'] -BTSResultsDF.sort_values(by=['matthew'], ascending=False, inplace=True) +CVResultsDF_baseline = YC_resD2['CrossValResultsDF'] +CVResultsDF_baseline.sort_values(by=['matthew'], ascending=False, inplace=True) +BTSResultsDF_baseline = YC_resD2['BlindTestResultsDF'] +BTSResultsDF_baseline.sort_values(by=['matthew'], ascending=False, inplace=True) # from sklearn.utils import all_estimators # for name, algorithm in all_estimators(type_filter="classifier"):