git add UQ_imbalance.py

commit 1da87ba177
parent 42c8c47e2d
4 changed files with 134 additions and 56 deletions
@@ -21,6 +21,7 @@ mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
 mm_skf_scores_df_all
 mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0)
 mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results
+baseline_BT = mm_skf_scores_df_all.filter(like='bts_', axis=0)

 #%%
 mm_skf_scoresD2 = MultClassPipeSKFCV(input_df = X_sm
@@ -29,28 +30,70 @@ mm_skf_scoresD2 = MultClassPipeSKFCV(input_df = X_sm
 , skf_cv = skf_cv)
 sm_all = pd.DataFrame(mm_skf_scoresD2)
 sm_df_CT = sm_all.filter(like='test_', axis=0)
+sm_df_BT = sm_all.filter(like='bts_', axis=0)

 #%%
 mm_skf_scoresD3 = MultClassPipeSKFCV(input_df = X_ros
 , target = y_ros
 , var_type = 'mixed'
-, skf_cv = skf_cv)
+, skf_cv = rskf_cv
+, blind_test_input_df = X_bts
+, blind_test_target = y_bts)
 ros_all = pd.DataFrame(mm_skf_scoresD3)
 ros_CT = ros_all.filter(like='test_', axis=0)
+ros_BT = ros_all.filter(like='bts_', axis=0)
+
+#--------- combined
+mm_skf_scoresD3v2 = MultClassPipeSKFCV(input_df = X_rouC
+, target = y_rouC
+, var_type = 'mixed'
+, skf_cv = rskf_cv
+, blind_test_input_df = X_bts
+, blind_test_target = y_bts)
+rouC_all = pd.DataFrame(mm_skf_scoresD3v2)
+rouC_CT = rouC_all.filter(like='test_', axis=0)
+rouC_BT = rouC_all.filter(like='bts_', axis=0)


 #%%
 mm_skf_scoresD4 = MultClassPipeSKFCV(input_df = X_rus
 , target = y_rus
-, var_type = 'mixed'
-, skf_cv = skf_cv)
+, var_type = 'numerical'
+, skf_cv = rskf_cv
+, blind_test_input_df = X_bts
+, blind_test_target = y_bts)
 rus_all = pd.DataFrame(mm_skf_scoresD4)
 rus_CT = rus_all.filter(like='test_', axis=0)
+rus_BT = rus_all.filter(like='bts_' , axis=0)

 #%%
 mm_skf_scoresD5 = MultClassPipeSKFCV(input_df = X_enn
 , target = y_enn
-, var_type = 'mixed'
-, skf_cv = skf_cv)
+, var_type = 'numerical'
+, skf_cv = rskf_cv
+, blind_test_input_df = X_bts
+, blind_test_target = y_bts)
 enn_all = pd.DataFrame(mm_skf_scoresD5)
 enn_CT = enn_all.filter(like='test_', axis=0)
+enn_BT = enn_all.filter(like='bts_', axis=0)
+
+#%%
+mm_skf_scoresD6 = MultClassPipeSKFCV(input_df = X_renn
+, target = y_renn
+, var_type = 'numerical'
+, skf_cv = rskf_cv
+, blind_test_input_df = X_bts
+, blind_test_target = y_bts)
+renn_all = pd.DataFrame(mm_skf_scoresD6)
+renn_CT = renn_all.filter(like='test_', axis=0)
+renn_BT = renn_all.filter(like='bts_', axis=0)
+
+#%%: with categorical values + oversampling
+mm_skf_scoresD7 = MultClassPipeSKFCV(input_df = X_smnc
+, target = y_smnc
+, var_type = 'mixed'
+, skf_cv = rskf_cv
+, blind_test_input_df = X_bts
+, blind_test_target = y_bts)
+smnc_all = pd.DataFrame(mm_skf_scoresD7)
+smnc_CT = smnc_all.filter(like='test_', axis=0)
+smnc_BT = smnc_all.filter(like='bts_', axis=0)

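For context only (these objects are not created in this commit): the resampled training sets and CV splitters referenced above are typically built with imblearn and sklearn along the following lines. The names X, y, X_bts, skf_cv, rskf_cv, X_sm, X_ros, X_rus, X_enn and X_renn match the script; the constructors and their arguments are assumptions, and X_rouC / X_smnc would come from a combined over-and-under sampler and SMOTENC respectively.

from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler, SMOTENC
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, RepeatedEditedNearestNeighbours

rs = {'random_state': 42}
skf_cv  = StratifiedKFold(n_splits = 10, shuffle = True, **rs)
rskf_cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, **rs)

# each resampler rebalances the training data only; the blind test set (X_bts, y_bts) stays untouched
X_sm, y_sm     = SMOTE(**rs).fit_resample(X, y)
X_ros, y_ros   = RandomOverSampler(**rs).fit_resample(X, y)
X_rus, y_rus   = RandomUnderSampler(**rs).fit_resample(X, y)
X_enn, y_enn   = EditedNearestNeighbours().fit_resample(X, y)
X_renn, y_renn = RepeatedEditedNearestNeighbours().fit_resample(X, y)
# hypothetical: SMOTENC additionally needs the positions of the categorical columns
# X_smnc, y_smnc = SMOTENC(categorical_features = categorical_positions, **rs).fit_resample(X, y)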
@@ -76,8 +76,8 @@ from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
 rs = {'random_state': 42}
 njobs = {'n_jobs': 10}

-scoring_fn = ({ 'fscore' : make_scorer(f1_score)
-, 'mcc' : make_scorer(matthews_corrcoef)
+scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef)
+, 'fscore' : make_scorer(f1_score)
 , 'precision' : make_scorer(precision_score)
 , 'recall' : make_scorer(recall_score)
 , 'accuracy' : make_scorer(accuracy_score)
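A note on how a scorer dict like scoring_fn behaves (standard sklearn behaviour, not specific to this commit): when it is passed to cross_validate, every key comes back prefixed with test_ (and train_ when return_train_score=True), which is what the filter(like='test_') / filter(like='train_') calls in the calling script rely on. A minimal sketch with one arbitrarily chosen estimator:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

cv_out = cross_validate(LogisticRegression(**rs), X, y
                        , cv = skf_cv
                        , scoring = scoring_fn
                        , return_train_score = True)
# cv_out has keys such as 'test_mcc', 'train_mcc', 'test_fscore', ... each holding an array of per-fold scores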
@@ -87,7 +87,10 @@ scoring_fn = ({ 'fscore' : make_scorer(f1_score)


 # Multiple Classification - Model Pipeline
-def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categorical','mixed']):
+def MultClassPipeSKFCV(input_df, target, skf_cv
+, blind_test_input_df
+, blind_test_target
+, var_type = ['numerical', 'categorical','mixed']):

 '''
 @ param input_df: input features
@@ -120,8 +123,8 @@ def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categ
 t = [('cat', OneHotEncoder(), categorical_ix)]

 if var_type == 'mixed':
-t = [('cat', OneHotEncoder(), categorical_ix)
-, ('num', MinMaxScaler(), numerical_ix)]
+t = [('num', MinMaxScaler(), numerical_ix)
+, ('cat', OneHotEncoder(), categorical_ix) ]

 col_transform = ColumnTransformer(transformers = t
 , remainder='passthrough')
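Worth spelling out (general ColumnTransformer behaviour): swapping the ('num', ...) and ('cat', ...) entries only changes the column order of the transformed output, because ColumnTransformer concatenates results in the listed order and appends the remainder='passthrough' columns last; which columns get scaled versus one-hot encoded is unchanged. A tiny illustration on a made-up frame:

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

toy = pd.DataFrame({'age': [1.0, 5.0, 9.0], 'grp': ['a', 'b', 'a']})
num_ix = toy.select_dtypes(include=['int64', 'float64']).columns
cat_ix = toy.select_dtypes(include=['object', 'bool']).columns
ct = ColumnTransformer(transformers = [('num', MinMaxScaler(), num_ix)
                                       , ('cat', OneHotEncoder(), cat_ix)]
                       , remainder = 'passthrough')
Xt = ct.fit_transform(toy)   # scaled 'age' column first, then the one-hot 'grp' columns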
@@ -137,7 +140,7 @@ def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categ
 rf = RandomForestClassifier(**rs, n_estimators = 1000 )
 rf2 = RandomForestClassifier(
 min_samples_leaf = 5
-, n_estimators = 100 #10
+, n_estimators = 1000
 , bootstrap = True
 , oob_score = True
 , **njobs
@@ -158,16 +161,16 @@ def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categ
 , ('K-Nearest Neighbors', knn)
 , ('SVM' , svm)
 , ('MLP' , mlp)
-# , ('Decision Tree' , dt)
-# , ('Extra Trees' , et)
-# , ('Random Forest' , rf)
-# , ('Naive Bayes' , nb)
-# , ('Random Forest2' , rf2)
-# , ('XGBoost' , xgb)
-# , ('LDA' , lda)
-# , ('MultinomialNB' , mnb)
-# , ('PassiveAggresive' , pa)
-# , ('StochasticGDescent' , sgd)
+, ('Decision Tree' , dt)
+, ('Extra Trees' , et)
+, ('Random Forest' , rf)
+, ('Naive Bayes' , nb)
+, ('Random Forest2' , rf2)
+, ('XGBoost' , xgb)
+, ('LDA' , lda)
+, ('MultinomialNB' , mnb)
+, ('PassiveAggresive' , pa)
+, ('StochasticGDescent' , sgd)
 ]

 mm_skf_scoresD = {}
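The loop body between this hunk and the next is not shown in the diff. Roughly (a sketch of the usual cross_validate pattern, not the committed code), each (model_name, model_fn) pair is wrapped in a pipeline with col_transform, cross-validated with scoring_fn, and its fold-averaged scores are stored under train_/test_ keys, before the blind-test block below adds the bts_ entries:

for model_name, model_fn in models:
    model_pipeline = Pipeline([('prep'  , col_transform)
                               , ('model', model_fn)])
    skf_scores = cross_validate(model_pipeline, input_df, target
                                , cv = skf_cv
                                , scoring = scoring_fn
                                , return_train_score = True)
    mm_skf_scoresD[model_name] = {}
    for key, fold_scores in skf_scores.items():
        mm_skf_scoresD[model_name][key] = round(fold_scores.mean(), 2)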
@@ -196,48 +199,41 @@ def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categ
 #pp.pprint(mm_skf_scoresD)

 #return(mm_skf_scoresD)




 #%%
 #=========================
 # Blind test: BTS results
 #=========================
 # Build the final results with all scores for a feature selected model
-#bts_predict = gscv_fs.predict(X_bts)
+#bts_predict = gscv_fs.predict(blind_test_input_df)
 model_pipeline.fit(input_df, target)
-bts_predict = model_pipeline.predict(X_bts)
+bts_predict = model_pipeline.predict(blind_test_input_df)

-print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, bts_predict),2))
-print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2))
-bts_mcc_score = round(matthews_corrcoef(y_bts, bts_predict),2)
+bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
+print('\nMCC on Blind test:' , bts_mcc_score)
+print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))

 # Diff b/w train and bts test scores
 # train_test_diff = train_bscore - bts_mcc_score
 # print('\nDiff b/w train and blind test score (MCC):', train_test_diff)


-# create a dict with all scores
-lr_btsD = { 'model_name': model_name
-, 'bts_mcc':None
-, 'bts_fscore':None
-, 'bts_precision':None
-, 'bts_recall':None
-, 'bts_accuracy':None
-, 'bts_roc_auc':None
-, 'bts_jaccard':None}
+# # create a dict with all scores
+# lr_btsD = { 'model_name': model_name
+# , 'bts_mcc':None
+# , 'bts_fscore':None
+# , 'bts_precision':None
+# , 'bts_recall':None
+# , 'bts_accuracy':None
+# , 'bts_roc_auc':None
+# , 'bts_jaccard':None}


-lr_btsD
-lr_btsD['bts_mcc'] = bts_mcc_score
-lr_btsD['bts_fscore'] = round(f1_score(y_bts, bts_predict),2)
-lr_btsD['bts_precision'] = round(precision_score(y_bts, bts_predict),2)
-lr_btsD['bts_recall'] = round(recall_score(y_bts, bts_predict),2)
-lr_btsD['bts_accuracy'] = round(accuracy_score(y_bts, bts_predict),2)
-lr_btsD['bts_roc_auc'] = round(roc_auc_score(y_bts, bts_predict),2)
-lr_btsD['bts_jaccard'] = round(jaccard_score(y_bts, bts_predict),2)
-lr_btsD
-
-return(lr_btsD)
+mm_skf_scoresD[model_name]['bts_mcc'] = bts_mcc_score
+mm_skf_scoresD[model_name]['bts_fscore'] = round(f1_score(blind_test_target, bts_predict),2)
+mm_skf_scoresD[model_name]['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2)
+mm_skf_scoresD[model_name]['bts_recall'] = round(recall_score(blind_test_target, bts_predict),2)
+mm_skf_scoresD[model_name]['bts_accuracy'] = round(accuracy_score(blind_test_target, bts_predict),2)
+mm_skf_scoresD[model_name]['bts_roc_auc'] = round(roc_auc_score(blind_test_target, bts_predict),2)
+mm_skf_scoresD[model_name]['bts_jaccard'] = round(jaccard_score(blind_test_target, bts_predict),2)
+return(mm_skf_scoresD)
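One caveat on the bts_roc_auc entry above: roc_auc_score is given hard class labels from predict(), which collapses the ROC curve to a single operating point. Where the final estimator supports predict_proba, a probability-based AUC (an alternative, not what this commit does) would look like:

bts_probs = model_pipeline.predict_proba(blind_test_input_df)[:, 1]  # probability of the positive class
mm_skf_scoresD[model_name]['bts_roc_auc'] = round(roc_auc_score(blind_test_target, bts_probs), 2)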
@@ -81,7 +81,11 @@ from sklearn.model_selection import cross_validate, cross_val_score
 from sklearn.model_selection import RepeatedStratifiedKFold
 from sklearn.ensemble import AdaBoostClassifier
 from imblearn.combine import SMOTEENN
+from imblearn.combine import SMOTETomek
+
+from imblearn.over_sampling import SMOTENC
 from imblearn.under_sampling import EditedNearestNeighbours
+from imblearn.under_sampling import RepeatedEditedNearestNeighbours


 from sklearn.model_selection import GridSearchCV
@@ -117,7 +121,8 @@ os.chdir(homedir + "/git/ML_AI_training/")
 #from MultClassPipe import MultClassPipeline
 from MultClassPipe2 import MultClassPipeline2
 from loopity_loop import MultClassPipeSKFLoop
-from MultClassPipe3 import MultClassPipeSKFCV
+#from MultClassPipe3 import MultClassPipeSKFCV
+from UQ_MultClassPipe4 import MultClassPipeSKFCV

 gene = 'pncA'
 drug = 'pyrazinamide'
@@ -285,9 +290,9 @@ all_df_wtgt.shape
 #------
 # X
 #------
-#X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL
+X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL
 X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL
-X = all_df_wtgt[numerical_FN] # training numerical only
+#X = all_df_wtgt[numerical_FN] # training numerical only
 #X_bts = blind_test_df[numerical_FN] # blind test data numerical

 #------
@@ -27,22 +27,56 @@ from sklearn.model_selection import train_test_split, cross_validate, cross_val_
 # Metric
 from sklearn.metrics import mean_squared_error, make_scorer, roc_auc_score, f1_score, matthews_corrcoef, accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report

-def run_all_ML(input_pd, target_label):
+#def run_all_ML(input_pd, target_label, bts_input, bts_target, var_type):
+def run_all_ML(input_pd, target_label, preprocess = True, var_type = 'numerical'):

 #y = input_pd[target_label]
 #X = input_pd.drop(target_label,axis=1)
 y = target_label
 X = input_pd
+# determine categorical and numerical features
+numerical_ix = input_pd.select_dtypes(include=['int64', 'float64']).columns
+numerical_ix
+categorical_ix = input_pd.select_dtypes(include=['object', 'bool']).columns
+categorical_ix
+
+# Determine preprocessing steps ~ var_type
+if var_type == 'numerical':
+t = [('num', MinMaxScaler(), numerical_ix)]
+
+if var_type == 'categorical':
+t = [('cat', OneHotEncoder(), categorical_ix)]
+
+if var_type == 'mixed':
+t = [('num', MinMaxScaler(), numerical_ix)
+, ('cat', OneHotEncoder(), categorical_ix)]
+
+col_transform = ColumnTransformer(transformers = t
+, remainder='passthrough')
 result_pd = pd.DataFrame()
 for name, algorithm in all_estimators(type_filter="classifier"):
 try:
 estmator = algorithm()
 temp_pd = pd.DataFrame()
 temp_cm = pd.DataFrame()

+# orig
 pipe = Pipeline([
-("model", algorithm())
+("model" , algorithm())
 ])

+# turn on and off preprocessing
+if preprocess == True:
+pipe = Pipeline([
+('prep' , col_transform),
+("model" , algorithm())
+])
+else:
+pipe = Pipeline([
+("model" , algorithm())
+])


 y_pred = cross_val_predict(pipe, X, y, cv = 10, n_jobs=10)
 _mcc = round(matthews_corrcoef(y_pred, y), 3)
 _bacc = round(balanced_accuracy_score(y_pred, y), 3)
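For completeness, a hypothetical call of the revised helper (argument values are assumptions, and the assignment assumes run_all_ML returns the accumulated result_pd; the function also relies on sklearn.utils.all_estimators, MinMaxScaler, OneHotEncoder and ColumnTransformer being imported at the top of the module):

results_df = run_all_ML(input_pd = X
                        , target_label = y
                        , preprocess = True
                        , var_type = 'mixed')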