trying under and oversampling
This commit is contained in:
parent
8f8306d948
commit
5779331981
5 changed files with 129 additions and 16 deletions
|
@ -60,6 +60,18 @@ from sklearn.ensemble import AdaBoostClassifier
|
||||||
from imblearn.combine import SMOTEENN
|
from imblearn.combine import SMOTEENN
|
||||||
from imblearn.under_sampling import EditedNearestNeighbours
|
from imblearn.under_sampling import EditedNearestNeighbours
|
||||||
|
|
||||||
|
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
|
||||||
|
from sklearn.neural_network import MLPClassifier
|
||||||
|
|
||||||
|
from sklearn.linear_model import RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier
|
||||||
|
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||||||
|
from sklearn.svm import SVC
|
||||||
|
from xgboost import XGBClassifier
|
||||||
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
from sklearn.linear_model import SGDClassifier
|
||||||
|
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
|
||||||
|
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
rs = {'random_state': 42}
|
rs = {'random_state': 42}
|
||||||
njobs = {'n_jobs': 10}
|
njobs = {'n_jobs': 10}
|
||||||
|
@ -122,8 +134,7 @@ def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categ
|
||||||
mlp = MLPClassifier(max_iter = 500, **rs)
|
mlp = MLPClassifier(max_iter = 500, **rs)
|
||||||
dt = DecisionTreeClassifier(**rs)
|
dt = DecisionTreeClassifier(**rs)
|
||||||
et = ExtraTreesClassifier(**rs)
|
et = ExtraTreesClassifier(**rs)
|
||||||
rf = RandomForestClassifier(**rs,
|
rf = RandomForestClassifier(**rs, n_estimators = 1000 )
|
||||||
n_estimators = 1000 )
|
|
||||||
rf2 = RandomForestClassifier(
|
rf2 = RandomForestClassifier(
|
||||||
min_samples_leaf = 5
|
min_samples_leaf = 5
|
||||||
, n_estimators = 100 #10
|
, n_estimators = 100 #10
|
||||||
|
@ -136,7 +147,7 @@ def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categ
|
||||||
|
|
||||||
lda = LinearDiscriminantAnalysis()
|
lda = LinearDiscriminantAnalysis()
|
||||||
|
|
||||||
mnb = MultinomialNB(**rs)
|
mnb = MultinomialNB()
|
||||||
|
|
||||||
pa = PassiveAggressiveClassifier(**rs, **njobs)
|
pa = PassiveAggressiveClassifier(**rs, **njobs)
|
||||||
|
|
||||||
|
|
|
@ -7,7 +7,7 @@ Created on Tue Mar 15 11:09:50 2022
|
||||||
"""
|
"""
|
||||||
#%% Data
|
#%% Data
|
||||||
X = all_df_wtgt[numerical_FN+categorical_FN]
|
X = all_df_wtgt[numerical_FN+categorical_FN]
|
||||||
y = all_df_wtgt['mutation_class']
|
y = all_df_wtgt[drug]
|
||||||
#y = all_df_wtgt['dst_mode']
|
#y = all_df_wtgt['dst_mode']
|
||||||
#%% variables
|
#%% variables
|
||||||
|
|
||||||
|
@ -17,9 +17,40 @@ mm_skf_scoresD = MultClassPipeSKFCV(input_df = X
|
||||||
, var_type = 'mixed'
|
, var_type = 'mixed'
|
||||||
, skf_cv = skf_cv)
|
, skf_cv = skf_cv)
|
||||||
|
|
||||||
|
|
||||||
mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
|
mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
|
||||||
mm_skf_scores_df_all
|
mm_skf_scores_df_all
|
||||||
mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0)
|
mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0)
|
||||||
mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results
|
mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results
|
||||||
]
|
|
||||||
|
#%%
|
||||||
|
mm_skf_scoresD2 = MultClassPipeSKFCV(input_df = X_sm
|
||||||
|
, target = y_sm
|
||||||
|
, var_type = 'mixed'
|
||||||
|
, skf_cv = skf_cv)
|
||||||
|
sm_all = pd.DataFrame(mm_skf_scoresD2)
|
||||||
|
sm_df_CT = sm_all.filter(like='test_', axis=0)
|
||||||
|
|
||||||
|
#%%
|
||||||
|
mm_skf_scoresD3 = MultClassPipeSKFCV(input_df = X_ros
|
||||||
|
, target = y_ros
|
||||||
|
, var_type = 'mixed'
|
||||||
|
, skf_cv = skf_cv)
|
||||||
|
ros_all = pd.DataFrame(mm_skf_scoresD3)
|
||||||
|
ros_CT = ros_all.filter(like='test_', axis=0)
|
||||||
|
|
||||||
|
#%%
|
||||||
|
mm_skf_scoresD4 = MultClassPipeSKFCV(input_df = X_rus
|
||||||
|
, target = y_rus
|
||||||
|
, var_type = 'mixed'
|
||||||
|
, skf_cv = skf_cv)
|
||||||
|
rus_all = pd.DataFrame(mm_skf_scoresD4)
|
||||||
|
rus_CT = rus_all.filter(like='test_', axis=0)
|
||||||
|
|
||||||
|
#%%
|
||||||
|
mm_skf_scoresD5 = MultClassPipeSKFCV(input_df = X_enn
|
||||||
|
, target = y_enn
|
||||||
|
, var_type = 'mixed'
|
||||||
|
, skf_cv = skf_cv)
|
||||||
|
enn_all = pd.DataFrame(mm_skf_scoresD5)
|
||||||
|
enn_CT = enn_all.filter(like='test_', axis=0)
|
||||||
|
|
||||||
|
|
|
@ -36,7 +36,8 @@ from sklearn.gaussian_process.kernels import WhiteKernel
|
||||||
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
|
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
|
||||||
from sklearn.neural_network import MLPClassifier
|
from sklearn.neural_network import MLPClassifier
|
||||||
|
|
||||||
from sklearn.linear_model import RidgeClassifier
|
from sklearn.linear_model import RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier
|
||||||
|
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||||||
from sklearn.svm import SVC
|
from sklearn.svm import SVC
|
||||||
from xgboost import XGBClassifier
|
from xgboost import XGBClassifier
|
||||||
from sklearn.naive_bayes import MultinomialNB
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
@ -72,6 +73,7 @@ print(pd.__version__)
|
||||||
from statistics import mean, stdev, median, mode
|
from statistics import mean, stdev, median, mode
|
||||||
|
|
||||||
from imblearn.over_sampling import RandomOverSampler
|
from imblearn.over_sampling import RandomOverSampler
|
||||||
|
from imblearn.under_sampling import RandomUnderSampler
|
||||||
from imblearn.over_sampling import SMOTE
|
from imblearn.over_sampling import SMOTE
|
||||||
from imblearn.pipeline import Pipeline
|
from imblearn.pipeline import Pipeline
|
||||||
from sklearn.datasets import make_classification
|
from sklearn.datasets import make_classification
|
||||||
|
@ -81,6 +83,7 @@ from sklearn.ensemble import AdaBoostClassifier
|
||||||
from imblearn.combine import SMOTEENN
|
from imblearn.combine import SMOTEENN
|
||||||
from imblearn.under_sampling import EditedNearestNeighbours
|
from imblearn.under_sampling import EditedNearestNeighbours
|
||||||
|
|
||||||
|
|
||||||
from sklearn.model_selection import GridSearchCV
|
from sklearn.model_selection import GridSearchCV
|
||||||
from sklearn.base import BaseEstimator
|
from sklearn.base import BaseEstimator
|
||||||
import json
|
import json
|
||||||
|
@ -119,6 +122,10 @@ from MultClassPipe3 import MultClassPipeSKFCV
|
||||||
gene = 'pncA'
|
gene = 'pncA'
|
||||||
drug = 'pyrazinamide'
|
drug = 'pyrazinamide'
|
||||||
|
|
||||||
|
#gene = 'katG'
|
||||||
|
#drug = 'isoniazid'
|
||||||
|
|
||||||
|
|
||||||
#==============
|
#==============
|
||||||
# directories
|
# directories
|
||||||
#==============
|
#==============
|
||||||
|
@ -234,13 +241,13 @@ numerical_FN = common_cols_stabiltyN + foldX_cols + X_strFN + X_evolFN + X_genom
|
||||||
|
|
||||||
#categorical feature names
|
#categorical feature names
|
||||||
categorical_FN = ['ss_class'
|
categorical_FN = ['ss_class'
|
||||||
, 'wt_prop_water'
|
# , 'wt_prop_water'
|
||||||
# , 'lineage_labels' # misleading if using merged_df3
|
# , 'lineage_labels' # misleading if using merged_df3
|
||||||
, 'mut_prop_water'
|
# , 'mut_prop_water'
|
||||||
, 'wt_prop_polarity'
|
# , 'wt_prop_polarity'
|
||||||
, 'mut_prop_polarity'
|
# , 'mut_prop_polarity'
|
||||||
, 'wt_calcprop'
|
# , 'wt_calcprop'
|
||||||
, 'mut_calcprop'
|
# , 'mut_calcprop'
|
||||||
#, 'active_aa_pos'
|
#, 'active_aa_pos'
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -278,9 +285,9 @@ all_df_wtgt.shape
|
||||||
#------
|
#------
|
||||||
# X
|
# X
|
||||||
#------
|
#------
|
||||||
X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL
|
#X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL
|
||||||
X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL
|
X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL
|
||||||
#X = all_df_wtgt[numerical_FN] # training numerical only
|
X = all_df_wtgt[numerical_FN] # training numerical only
|
||||||
#X_bts = blind_test_df[numerical_FN] # blind test data numerical
|
#X_bts = blind_test_df[numerical_FN] # blind test data numerical
|
||||||
|
|
||||||
#------
|
#------
|
||||||
|
|
|
@ -574,6 +574,7 @@ param_grid_svc = [
|
||||||
#========
|
#========
|
||||||
# LDA
|
# LDA
|
||||||
# https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/lda.py
|
# https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/lda.py
|
||||||
|
# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html
|
||||||
#========
|
#========
|
||||||
|
|
||||||
estimator = LinearDiscriminantAnalysis()
|
estimator = LinearDiscriminantAnalysis()
|
||||||
|
@ -605,9 +606,10 @@ param_grid_lda = [
|
||||||
#========
|
#========
|
||||||
# Multinomial_nb
|
# Multinomial_nb
|
||||||
# https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/multinomial_nb.py
|
# https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/multinomial_nb.py
|
||||||
|
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
|
||||||
#========
|
#========
|
||||||
|
|
||||||
estimator = MultinomialNB(**rs)
|
estimator = MultinomialNB()
|
||||||
|
|
||||||
# Define pipeline with steps
|
# Define pipeline with steps
|
||||||
pipe_mnb = Pipeline([
|
pipe_mnb = Pipeline([
|
||||||
|
@ -635,6 +637,7 @@ param_grid_mnb = [
|
||||||
#========
|
#========
|
||||||
# passive_aggressive
|
# passive_aggressive
|
||||||
# https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/passive_aggressive.py
|
# https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/passive_aggressive.py
|
||||||
|
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html
|
||||||
#========
|
#========
|
||||||
|
|
||||||
estimator = PassiveAggressiveClassifier(**rs, **njobs)
|
estimator = PassiveAggressiveClassifier(**rs, **njobs)
|
||||||
|
@ -668,6 +671,7 @@ param_grid_pa = [
|
||||||
#========
|
#========
|
||||||
# SGD
|
# SGD
|
||||||
# https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/sgd.py
|
# https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/sgd.py
|
||||||
|
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
|
||||||
#========
|
#========
|
||||||
|
|
||||||
estimator = SGDClassifier(**rs, **njobs)
|
estimator = SGDClassifier(**rs, **njobs)
|
||||||
|
|
60
uq_ml_models_FS/scriptfsycm.py
Normal file
60
uq_ml_models_FS/scriptfsycm.py
Normal file
|
@ -0,0 +1,60 @@
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import scipy as sp
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import argparse
|
||||||
|
from math import sqrt
|
||||||
|
from scipy import stats
|
||||||
|
import joblib
|
||||||
|
# Algorithm
|
||||||
|
from xgboost.sklearn import XGBClassifier
|
||||||
|
from sklearn import svm
|
||||||
|
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
|
||||||
|
from sklearn.gaussian_process import GaussianProcessClassifier
|
||||||
|
from sklearn.ensemble import AdaBoostClassifier
|
||||||
|
from sklearn.neighbors import KNeighborsClassifier
|
||||||
|
from sklearn.neural_network import MLPRegressor
|
||||||
|
from sklearn.utils import all_estimators
|
||||||
|
# Pre-processing
|
||||||
|
from sklearn import preprocessing
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
from sklearn.datasets import make_classification
|
||||||
|
from sklearn.pipeline import Pipeline, make_pipeline
|
||||||
|
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
|
||||||
|
# Metric
|
||||||
|
from sklearn.metrics import mean_squared_error, make_scorer, roc_auc_score, f1_score, matthews_corrcoef, accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report
|
||||||
|
|
||||||
|
def run_all_ML(input_pd, target_label, cv=10, n_jobs=10):
    """Cross-validate every scikit-learn classifier with default settings.

    Each class yielded by ``all_estimators(type_filter="classifier")`` is
    instantiated without arguments, wrapped in a one-step ``Pipeline`` and
    scored on the out-of-fold predictions from ``cross_val_predict``.

    Parameters
    ----------
    input_pd
        Feature matrix, handed to ``cross_val_predict`` as ``X``.
    target_label
        Target values, handed to ``cross_val_predict`` as ``y``. Despite the
        name this is the label vector itself, not a column name.
    cv
        Cross-validation setting forwarded to ``cross_val_predict``
        (default 10, the previously hard-coded value).
    n_jobs
        Number of parallel jobs forwarded to ``cross_val_predict``
        (default 10, the previously hard-coded value).

    Returns
    -------
    pandas.DataFrame
        One row per estimator that completed, with columns ``estimator``,
        ``TP``, ``TN``, ``FP``, ``FN``, ``roc_auc``, ``matthew``, ``bacc``
        and ``f1``. Metric columns hold numbers (rounded to 3 decimals),
        not strings.

    Notes
    -----
    Estimators that raise at any step (construction, fitting, scoring) are
    reported on stdout and skipped, so the run is best-effort by design.
    The four-way unpacking of the confusion matrix assumes a binary target;
    with more classes the unpacking fails and the estimator is skipped.
    """
    y = target_label
    X = input_pd

    columns = ['estimator', 'TP', 'TN', 'FP', 'FN',
               'roc_auc', 'matthew', 'bacc', 'f1']
    rows = []

    for name, algorithm in all_estimators(type_filter="classifier"):
        try:
            pipe = Pipeline([
                ("model", algorithm())
            ])
            y_pred = cross_val_predict(pipe, X, y, cv=cv, n_jobs=n_jobs)

            # scikit-learn metrics take (y_true, y_pred). Swapping them
            # transposes the confusion matrix (FP <-> FN) and changes
            # balanced accuracy and ROC AUC.
            _mcc = round(matthews_corrcoef(y, y_pred), 3)
            _bacc = round(balanced_accuracy_score(y, y_pred), 3)
            _f1 = round(f1_score(y, y_pred), 3)
            _roc_auc = round(roc_auc_score(y, y_pred), 3)
            _tn, _fp, _fn, _tp = confusion_matrix(y, y_pred).ravel()
        except Exception as e:
            # Deliberately broad: many estimators cannot run with default
            # arguments or on this data; report and move on.
            print("Got an error while running {}".format(name))
            print(e)
            continue

        rows.append({
            'estimator': name,
            'TP': _tp,
            'TN': _tn,
            'FP': _fp,
            'FN': _fn,
            'roc_auc': _roc_auc,
            'matthew': _mcc,
            'bacc': _bacc,
            'f1': _f1,
        })

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # appending inside the loop was quadratic anyway.
    return pd.DataFrame(rows, columns=columns)
|
Loading…
Add table
Add a link
Reference in a new issue