#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022

@author: tanu
"""
# ---------------------------------------------------------------------------
# MultModelsCl_CALL.py
# ---------------------------------------------------------------------------
# Driver script: runs MultModelsCl() on the baseline data and on several
# resampled versions of it, then builds one cross-validation table and one
# blind-test table per run, each ranked by MCC.
#
# NOTE(review): this script defines none of pd, MultModelsCl, skf_cv, X, y,
# X_bts, y_bts or the resampled X_*/y_* objects; it presumably expects them to
# exist in the running session already -- confirm against the calling script.


def _ranked_score_tables(input_df, target, var_type='mixed'):
    """Run MultModelsCl on one training set and rank the resulting scores.

    Parameters
    ----------
    input_df : training features handed to MultModelsCl as ``input_df``.
    target : training labels handed to MultModelsCl as ``target``.
    var_type : preprocessing selector handed to MultModelsCl as ``var_type``.

    Returns
    -------
    tuple ``(scores, all_scores, cv_scores, bts_scores)`` where

    * ``scores`` is the raw dict returned by MultModelsCl,
    * ``all_scores`` is that dict as a DataFrame with one row per model,
    * ``cv_scores`` keeps the columns containing ``'test_'``, sorted by
      ``test_mcc`` in descending order,
    * ``bts_scores`` keeps the columns containing ``'bts_'``, sorted by
      ``bts_mcc`` in descending order.

    The CV splitter and the blind-test data are read from the module-level
    names ``skf_cv``, ``X_bts`` and ``y_bts`` at call time.
    """
    scores = MultModelsCl(input_df=input_df,
                          target=target,
                          var_type=var_type,
                          skf_cv=skf_cv,
                          blind_test_input_df=X_bts,
                          blind_test_target=y_bts)

    # MultModelsCl returns {model: {metric: value}}; transpose so that each
    # model becomes a row and each metric a column.
    all_scores = pd.DataFrame(scores).T

    # sort_values() returns a new frame, so the filtered views are never
    # modified in place.
    cv_scores = (all_scores.filter(like='test_', axis=1)
                 .sort_values(by=['test_mcc'], ascending=False))
    bts_scores = (all_scores.filter(like='bts_', axis=1)
                  .sort_values(by=['bts_mcc'], ascending=False))
    return scores, all_scores, cv_scores, bts_scores


#%% MultModelsCl: function call()
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = \
    _ranked_score_tables(X, y, 'mixed')
#baseline_train = baseline_all.filter(like='train_', axis=1)

#%% SMOTE OS: Numerical only
# mm_skf_scoresD2, sm_all, sm_CT, sm_BT = \
#     _ranked_score_tables(X_sm, y_sm, 'numerical')

#%% SMOTE ENN: Over + Undersampling combined: Numerical ONLY
# mm_skf_scoresD5, enn_all, enn_CT, enn_BT = \
#     _ranked_score_tables(X_enn, y_enn, 'numerical')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = \
    _ranked_score_tables(X_smnc, y_smnc, 'mixed')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = \
    _ranked_score_tables(X_ros, y_ros, 'mixed')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = \
    _ranked_score_tables(X_rus, y_rus, 'mixed')

#%% ROS + RUS Combined: Numerical + categorical
# The tables are derived from this run's own results (rouC_all), not from the
# ROS run.
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = \
    _ranked_score_tables(X_rouC, y_rouC, 'mixed')

#%%
# mm_skf_scoresD6, renn_all, renn_CT, renn_BT = \
#     _ranked_score_tables(X_renn, y_renn, 'numerical')


# ---------------------------------------------------------------------------
# UQ_MultModelsCl.py
# ---------------------------------------------------------------------------
# Created on Fri Mar 4 15:25:33 2022
# @author: tanu
#%%

# -- standard library
import itertools
import os
import pprint as pp
import sys
#from copy import deepcopy
from statistics import mean, stdev, median, mode

# -- third party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import linear_model
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# BaggingClassifier and GradientBoostingClassifier are instantiated by
# MultModelsCl below.
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
# GaussianProcessClassifier is instantiated by MultModelsCl below.
from sklearn.gaussian_process import GaussianProcessClassifier
# RidgeClassifierCV is instantiated by MultModelsCl below.
from sklearn.linear_model import (LinearRegression, LogisticRegression,
                                  LogisticRegressionCV,
                                  PassiveAggressiveClassifier,
                                  RidgeClassifier, RidgeClassifierCV,
                                  SGDClassifier)
from sklearn.metrics import (accuracy_score, average_precision_score,
                             classification_report, confusion_matrix,
                             f1_score, jaccard_score, make_scorer,
                             matthews_corrcoef, precision_score, recall_score,
                             roc_auc_score, roc_curve)
from sklearn.model_selection import (RepeatedStratifiedKFold, StratifiedKFold,
                                     cross_validate, train_test_split)
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.svm import SVC
# ExtraTreeClassifier is instantiated by MultModelsCl below.
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.utils import all_estimators
from xgboost import XGBClassifier

from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
# Imported after sklearn's Pipeline, so `Pipeline` ends up bound to the
# imblearn class for the rest of the module.
from imblearn.pipeline import Pipeline

print(np.__version__)
print(pd.__version__)

#%%
# Keyword-argument bundles splatted into the estimator constructors.
rs = {'random_state': 42}
njobs = {'n_jobs': 10}

# Scorers handed to cross_validate(scoring=...); the dict keys become the
# metric suffix of the score names.
scoring_fn = {
    'mcc': make_scorer(matthews_corrcoef),
    'fscore': make_scorer(f1_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'accuracy': make_scorer(accuracy_score),
    'roc_auc': make_scorer(roc_auc_score),
    'jaccard': make_scorer(jaccard_score),
}

#%%
# Multiple Classification -
# Model Pipeline
def MultModelsCl(input_df, target, skf_cv
                 , blind_test_input_df
                 , blind_test_target
                 , var_type = ['numerical', 'categorical','mixed']):
    '''
    Cross-validate a fixed set of classifiers and score each on a blind test set.

    @param input_df: input features
    @type: df with input features WITHOUT the target variable

    @param target: target (or output) feature
    @type: df or np.array or Series

    @param skf_cv: stratified K-fold int or object, passed as cv to cross_validate
    @type: int or StratifiedKFold()

    @param blind_test_input_df: features of the held-out (blind) test set
    @type: df with the same columns as input_df

    @param blind_test_target: labels of the held-out (blind) test set
    @type: df or np.array or Series

    @param var_type: exactly one of 'numerical', 'categorical' or 'mixed';
        selects the column transform (MinMaxScaler and/or OneHotEncoder).
        The list in the signature only enumerates the choices; it is not a
        usable value, so callers must pass one of the three strings.
    @type: str

    returns
        Dict keyed by model name. Each value is a dict holding, for every key
        returned by cross_validate, the mean over folds rounded to 2 decimals,
        plus the blind-test scores under 'bts_mcc', 'bts_fscore',
        'bts_precision', 'bts_recall', 'bts_accuracy', 'bts_roc_auc' and
        'bts_jaccard'.

    raises
        ValueError if var_type is not one of the three accepted strings.
    '''
    # Fail early and explicitly: without this check an unmatched var_type
    # leaves the transformer list undefined and crashes further down.
    if var_type not in ('numerical', 'categorical', 'mixed'):
        raise ValueError("var_type must be one of 'numerical', 'categorical' "
                         "or 'mixed'; got %r" % (var_type,))

    # Local imports keep this function self-contained for the estimators that
    # are only used here.
    from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.linear_model import RidgeClassifierCV
    from sklearn.tree import ExtraTreeClassifier

    # determine categorical and numerical features
    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns

    # Determine preprocessing steps ~ var_type
    # NOTE(review): OneHotEncoder() is used with its defaults; presumably a
    # category absent from a training fold would make transform fail -- confirm
    # whether handle_unknown='ignore' is wanted.
    if var_type == 'numerical':
        t = [('num', MinMaxScaler(), numerical_ix)]
    elif var_type == 'categorical':
        t = [('cat', OneHotEncoder(), categorical_ix)]
    else:
        t = [('num', MinMaxScaler(), numerical_ix)
             , ('cat', OneHotEncoder(), categorical_ix)]

    col_transform = ColumnTransformer(transformers = t
                                      , remainder='passthrough')

    #%% Specify multiple Classification models
    # Each display name appears once: it is the key of the result dict, so a
    # repeated name would only retrain a model and overwrite its own scores.
    models = [('Logistic Regression'     , LogisticRegression(**rs))
              , ('Logistic RegressionCV' , LogisticRegressionCV(**rs))
              , ('Gaussian NB'           , GaussianNB())
              , ('Naive Bayes'           , BernoulliNB())
              , ('K-Nearest Neighbors'   , KNeighborsClassifier())
              , ('SVM'                   , SVC(**rs))
              , ('MLP'                   , MLPClassifier(max_iter = 500, **rs))
              , ('Decision Tree'         , DecisionTreeClassifier(**rs))
              , ('Extra Trees'           , ExtraTreesClassifier(**rs))
              , ('Extra Tree'            , ExtraTreeClassifier(**rs))
              , ('Random Forest'         , RandomForestClassifier(**rs, n_estimators = 1000))
              # max_features='sqrt' is what 'auto' meant for classifiers;
              # 'auto' itself is deprecated in scikit-learn.
              , ('Random Forest2'        , RandomForestClassifier(min_samples_leaf = 5
                                                                   , n_estimators = 1000
                                                                   , bootstrap = True
                                                                   , oob_score = True
                                                                   , **njobs
                                                                   , **rs
                                                                   , max_features = 'sqrt'))
              , ('XGBoost'               , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False))
              , ('LDA'                   , LinearDiscriminantAnalysis())
              , ('Multinomial'           , MultinomialNB())
              , ('Passive Aggresive'     , PassiveAggressiveClassifier(**rs, **njobs))
              , ('Stochastic GDescent'   , SGDClassifier(**rs, **njobs))
              , ('AdaBoost Classifier'   , AdaBoostClassifier(**rs))
              , ('Bagging Classifier'    , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True))
              , ('Gaussian Process'      , GaussianProcessClassifier(**rs))
              , ('Gradient Boosting'     , GradientBoostingClassifier(**rs))
              , ('QDA'                   , QuadraticDiscriminantAnalysis())
              , ('Ridge Classifier'      , RidgeClassifier(**rs))
              , ('Ridge ClassifierCV'    , RidgeClassifierCV(cv = 10))
              ]

    # Blind-test metrics, in the order their keys are added to the result.
    # NOTE(review): roc_auc_score receives hard class predictions here, not
    # probabilities or decision scores -- confirm that is intended.
    bts_metrics = (('bts_mcc'         , matthews_corrcoef)
                   , ('bts_fscore'    , f1_score)
                   , ('bts_precision' , precision_score)
                   , ('bts_recall'    , recall_score)
                   , ('bts_accuracy'  , accuracy_score)
                   , ('bts_roc_auc'   , roc_auc_score)
                   , ('bts_jaccard'   , jaccard_score))

    mm_skf_scoresD = {}

    for model_name, model_fn in models:
        print('\nModel_name:', model_name
              , '\nModel func:', model_fn)

        model_pipeline = Pipeline([
            ('prep'    , col_transform)
            , ('model' , model_fn)])

        print('Running model pipeline:', model_pipeline)
        skf_cv_mod = cross_validate(model_pipeline
                                    , input_df
                                    , target
                                    , cv = skf_cv
                                    , scoring = scoring_fn
                                    , return_train_score = True)

        # Collapse the per-fold arrays to a single rounded mean per key.
        model_scores = {}
        for key, value in skf_cv_mod.items():
            mean_value = mean(value)
            print('\nkey:', key, '\nvalue:', value)
            print('\nmean value:', mean_value)
            model_scores[key] = round(mean_value, 2)
        mm_skf_scoresD[model_name] = model_scores

        #=========================
        # Blind test: BTS results
        #=========================
        # NOTE(review): kept inside the per-model loop because the result dict
        # is indexed by model_name for every bts_* key -- confirm against the
        # original indentation.
        # Refit on the full training data, then predict the blind test set.
        model_pipeline.fit(input_df, target)
        bts_predict = model_pipeline.predict(blind_test_input_df)

        for key, metric_fn in bts_metrics:
            model_scores[key] = round(metric_fn(blind_test_target, bts_predict), 2)

        print('\nMCC on Blind test:', model_scores['bts_mcc'])
        print('\nAccuracy on Blind test:', model_scores['bts_accuracy'])

    return(mm_skf_scoresD)


# ---------------------------------------------------------------------------
# pnca_config.py
# ---------------------------------------------------------------------------
# Created on Sat May 28 05:25:30 2022
# @author: tanu
import os
import sys

def MyGlobalVars():
    """Define the module-level settings `gene`, `drug` and `homedir`.

    Sets three globals of this module:
      gene    -- the string 'pncA'
      drug    -- the string 'pyrazinamide'
      homedir -- the current user's home directory, from os.path.expanduser("~")

    Takes no arguments and returns None.
    """
    global gene
    global drug
    global homedir
    gene = 'pncA'
    drug = 'pyrazinamide'
    homedir = os.path.expanduser("~")


MyGlobalVars()

# Work from the training repository inside the user's home directory.
# NOTE(review): presumably the project modules imported below live in this
# directory; changing directory only helps the imports if the current working
# directory is on sys.path -- confirm for non-interactive runs.
os.chdir(os.path.join(homedir, "git", "ML_AI_training"))

# my function
from UQ_MultClassPipe4 import MultClassPipeSKFCV
# Import by module name: with a ".py" suffix Python would look for a submodule
# named "py" inside a package called UQ_pnca_ML and the import would fail.
from UQ_pnca_ML import *

#from scriptsfymcn import run_all_ML


# YC_resD2 = run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')

# CVResultsDF = YC_resD2['CrossValResultsDF']
# CVResultsDF.sort_values(by=['matthew'], ascending=False, inplace=True)
# BTSResultsDF = YC_resD2['BlindTestResultsDF']
# BTSResultsDF.sort_values(by=['matthew'], ascending=False, inplace=True)