diff --git a/MultClassPipe3.py b/MultClassPipe3.py index 5c7a780..ac78440 100644 --- a/MultClassPipe3.py +++ b/MultClassPipe3.py @@ -60,6 +60,18 @@ from sklearn.ensemble import AdaBoostClassifier from imblearn.combine import SMOTEENN from imblearn.under_sampling import EditedNearestNeighbours +from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis +from sklearn.neural_network import MLPClassifier + +from sklearn.linear_model import RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from sklearn.svm import SVC +from xgboost import XGBClassifier +from sklearn.naive_bayes import MultinomialNB +from sklearn.linear_model import SGDClassifier +from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder + + #%% rs = {'random_state': 42} njobs = {'n_jobs': 10} @@ -122,8 +134,7 @@ def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categ mlp = MLPClassifier(max_iter = 500, **rs) dt = DecisionTreeClassifier(**rs) et = ExtraTreesClassifier(**rs) - rf = RandomForestClassifier(**rs, - n_estimators = 1000 ) + rf = RandomForestClassifier(**rs, n_estimators = 1000 ) rf2 = RandomForestClassifier( min_samples_leaf = 5 , n_estimators = 100 #10 @@ -136,7 +147,7 @@ def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categ lda = LinearDiscriminantAnalysis() - mnb = MultinomialNB(**rs) + mnb = MultinomialNB() pa = PassiveAggressiveClassifier(**rs, **njobs) diff --git a/MultClassPipe3_CALL.py b/MultClassPipe3_CALL.py index c9e1032..a2d18c1 100644 --- a/MultClassPipe3_CALL.py +++ b/MultClassPipe3_CALL.py @@ -7,7 +7,7 @@ Created on Tue Mar 15 11:09:50 2022 """ #%% Data X = all_df_wtgt[numerical_FN+categorical_FN] -y = all_df_wtgt['mutation_class'] +y = all_df_wtgt[drug] #y = all_df_wtgt['dst_mode'] #%% variables @@ -17,9 +17,40 @@ mm_skf_scoresD = MultClassPipeSKFCV(input_df = X , var_type = 'mixed' , skf_cv = skf_cv) - 
mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD) mm_skf_scores_df_all mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0) mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results -] + +#%% +mm_skf_scoresD2 = MultClassPipeSKFCV(input_df = X_sm + , target = y_sm + , var_type = 'mixed' + , skf_cv = skf_cv) +sm_all = pd.DataFrame(mm_skf_scoresD2) +sm_df_CT = sm_all.filter(like='test_', axis=0) + +#%% +mm_skf_scoresD3 = MultClassPipeSKFCV(input_df = X_ros + , target = y_ros + , var_type = 'mixed' + , skf_cv = skf_cv) +ros_all = pd.DataFrame(mm_skf_scoresD3) +ros_CT = ros_all.filter(like='test_', axis=0) + +#%% +mm_skf_scoresD4 = MultClassPipeSKFCV(input_df = X_rus + , target = y_rus + , var_type = 'mixed' + , skf_cv = skf_cv) +rus_all = pd.DataFrame(mm_skf_scoresD4) +rus_CT = rus_all.filter(like='test_', axis=0) + +#%% +mm_skf_scoresD5 = MultClassPipeSKFCV(input_df = X_enn + , target = y_enn + , var_type = 'mixed' + , skf_cv = skf_cv) +enn_all = pd.DataFrame(mm_skf_scoresD5) +enn_CT = enn_all.filter(like='test_', axis=0) + diff --git a/UQ_pnca_ML.py b/UQ_pnca_ML.py index d0a4099..7d5e4fe 100644 --- a/UQ_pnca_ML.py +++ b/UQ_pnca_ML.py @@ -36,7 +36,8 @@ from sklearn.gaussian_process.kernels import WhiteKernel from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis from sklearn.neural_network import MLPClassifier -from sklearn.linear_model import RidgeClassifier +from sklearn.linear_model import RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.svm import SVC from xgboost import XGBClassifier from sklearn.naive_bayes import MultinomialNB @@ -72,6 +73,7 @@ print(pd.__version__) from statistics import mean, stdev, median, mode from imblearn.over_sampling import RandomOverSampler +from imblearn.under_sampling import RandomUnderSampler from imblearn.over_sampling import SMOTE from 
imblearn.pipeline import Pipeline from sklearn.datasets import make_classification @@ -81,6 +83,7 @@ from sklearn.ensemble import AdaBoostClassifier from imblearn.combine import SMOTEENN from imblearn.under_sampling import EditedNearestNeighbours + from sklearn.model_selection import GridSearchCV from sklearn.base import BaseEstimator import json @@ -119,6 +122,10 @@ from MultClassPipe3 import MultClassPipeSKFCV gene = 'pncA' drug = 'pyrazinamide' +#gene = 'katG' +#drug = 'isoniazid' + + #============== # directories #============== @@ -234,13 +241,13 @@ numerical_FN = common_cols_stabiltyN + foldX_cols + X_strFN + X_evolFN + X_genom #categorical feature names categorical_FN = ['ss_class' - , 'wt_prop_water' +# , 'wt_prop_water' # , 'lineage_labels' # misleading if using merged_df3 - , 'mut_prop_water' - , 'wt_prop_polarity' - , 'mut_prop_polarity' - , 'wt_calcprop' - , 'mut_calcprop' +# , 'mut_prop_water' +# , 'wt_prop_polarity' +# , 'mut_prop_polarity' +# , 'wt_calcprop' +# , 'mut_calcprop' #, 'active_aa_pos' ] @@ -278,9 +285,9 @@ all_df_wtgt.shape #------ # X #------ -X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL +#X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL -#X = all_df_wtgt[numerical_FN] # training numerical only +X = all_df_wtgt[numerical_FN] # training numerical only #X_bts = blind_test_df[numerical_FN] # blind test data numerical #------ diff --git a/classification_params_FS.py b/classification_params_FS.py index 12f53bd..e5b4891 100644 --- a/classification_params_FS.py +++ b/classification_params_FS.py @@ -574,6 +574,7 @@ param_grid_svc = [ #======== # LDA # https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/lda.py +# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html #======== estimator = LinearDiscriminantAnalysis() @@ -605,9 
+606,10 @@ param_grid_lda = [ #======== # Multinomial_nb # https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/multinomial_nb.py +# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html #======== -estimator = MultinomialNB(**rs) +estimator = MultinomialNB() # Define pipleline with steps pipe_mnb = Pipeline([ @@ -635,6 +637,7 @@ param_grid_mnb = [ #======== # passive_aggressive # https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/passive_aggressive.py +# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html #======== estimator = PassiveAggressiveClassifier(**rs, **njobs) @@ -668,6 +671,7 @@ param_grid_pa = [ #======== # SGD # https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/sgd.py +# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html #======== estimator = SGDClassifier(**rs, **njobs) diff --git a/uq_ml_models_FS/scriptfsycm.py b/uq_ml_models_FS/scriptfsycm.py new file mode 100644 index 0000000..4125c12 --- /dev/null +++ b/uq_ml_models_FS/scriptfsycm.py @@ -0,0 +1,60 @@ +import pandas as pd +import numpy as np +import scipy as sp +import time +import sys +import os +import re +import argparse +from math import sqrt +from scipy import stats +import joblib +# Alogorithm +from xgboost.sklearn import XGBClassifier +from sklearn import svm +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier +from sklearn.gaussian_process import GaussianProcessClassifier +from sklearn.ensemble import AdaBoostClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.neural_network import MLPRegressor +from sklearn.utils import all_estimators +# Pre-processing +from sklearn import preprocessing +from sklearn.preprocessing import StandardScaler +from 
sklearn.datasets import make_classification
+from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
+# Metric
+from sklearn.metrics import mean_squared_error, make_scorer, roc_auc_score, f1_score, matthews_corrcoef, accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report
+
+def run_all_ML(input_pd, target_label):
+    """Score each scikit-learn classifier by 10-fold cross-validated prediction.
+
+    input_pd is the feature matrix and target_label the binary target vector
+    (not a column name). Returns a DataFrame with one row per estimator that
+    ran (TP, TN, FP, FN, roc_auc, matthew, bacc, f1, kept numeric); estimators
+    that raise are printed and skipped so one failure cannot abort the sweep.
+    """
+    X, y = input_pd, target_label
+    rows = []
+    for name, algorithm in all_estimators(type_filter="classifier"):
+        try:
+            pipe = Pipeline([("model", algorithm())])
+            y_pred = cross_val_predict(pipe, X, y, cv=10, n_jobs=10)
+            # scikit-learn metrics take (y_true, y_pred); the reversed order
+            # transposes the confusion matrix (FP <-> FN) and skews bacc/AUC.
+            _tn, _fp, _fn, _tp = confusion_matrix(y, y_pred).ravel()
+            rows.append({'estimator': name, 'TP': _tp, 'TN': _tn,
+                         'FP': _fp, 'FN': _fn,
+                         # AUC is computed from hard labels, not from scores
+                         'roc_auc': round(roc_auc_score(y, y_pred), 3),
+                         'matthew': round(matthews_corrcoef(y, y_pred), 3),
+                         'bacc': round(balanced_accuracy_score(y, y_pred), 3),
+                         'f1': round(f1_score(y, y_pred), 3)})
+        except Exception as e:
+            # Best-effort sweep: report the estimator that failed and move on.
+            print("Got an error while running {}".format(name))
+            print(e)
+    # DataFrame.append was removed in pandas 2.0; build the frame in one go.
+    return pd.DataFrame(rows, columns=['estimator', 'TP', 'TN', 'FP', 'FN',
+                                       'roc_auc', 'matthew', 'bacc', 'f1'])