added loopity_loop to run multiple models with stratified k-fold, got stuck in infinite loops and nested dicts
This commit is contained in:
parent
69d0c1b557
commit
7aead2d4f4
18 changed files with 287 additions and 62 deletions
48
SKF_SSF.txt
Normal file
48
SKF_SSF.txt
Normal file
|
@ -0,0 +1,48 @@
|
|||
# Stratified K-fold vs ShuffleSplit
|
||||
|
||||
https://stackoverflow.com/questions/45969390/difference-between-stratifiedkfold-and-stratifiedshufflesplit-in-sklearn
|
||||
|
||||
In ShuffleSplit, the data is shuffled every time, and then split. This means the test sets may overlap between the splits.
|
||||
In SKF, test sets don't overlap
|
||||
|
||||
So the difference is this: StratifiedKFold shuffles (if requested) and splits the data only once, so its test sets never overlap. StratifiedShuffleSplit reshuffles the data before each of its n_splits splits, so its test sets can overlap.
|
||||
|
||||
Note: both methods use stratified folds (that is why "stratified" appears in both names). This means each part preserves the same percentage of samples of each class (label) as the original data. You can read more in the scikit-learn cross-validation documentation.
|
||||
|
||||
|
||||
''' python code '''
|
||||
splits = 5
|
||||
|
||||
tx = range(10)
|
||||
ty = [0] * 5 + [1] * 5
|
||||
|
||||
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
|
||||
from sklearn import datasets
|
||||
|
||||
kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
|
||||
shufflesplit = StratifiedShuffleSplit(n_splits=splits, random_state=42, test_size=2)
|
||||
|
||||
print("KFold")
|
||||
for train_index, test_index in kfold.split(tx, ty):
|
||||
print("TRAIN:", train_index, "TEST:", test_index)
|
||||
|
||||
print("Shuffle Split")
|
||||
for train_index, test_index in shufflesplit.split(tx, ty):
|
||||
print("TRAIN:", train_index, "TEST:", test_index)
|
||||
|
||||
'''
|
||||
Output:
|
||||
|
||||
KFold
|
||||
TRAIN: [0 2 3 4 5 6 7 9] TEST: [1 8]
|
||||
TRAIN: [0 1 2 3 5 7 8 9] TEST: [4 6]
|
||||
TRAIN: [0 1 3 4 5 6 8 9] TEST: [2 7]
|
||||
TRAIN: [1 2 3 4 6 7 8 9] TEST: [0 5]
|
||||
TRAIN: [0 1 2 4 5 6 7 8] TEST: [3 9]
|
||||
|
||||
Shuffle Split
|
||||
TRAIN: [8 4 1 0 6 5 7 2] TEST: [3 9]
|
||||
TRAIN: [7 0 3 9 4 5 1 6] TEST: [8 2]
|
||||
TRAIN: [1 2 5 6 4 8 9 0] TEST: [3 7]
|
||||
TRAIN: [4 6 7 8 3 5 1 2] TEST: [9 0]
|
||||
TRAIN: [7 2 6 5 4 3 0 9] TEST: [1 8]
|
BIN
__pycache__/MultClassPipe2.cpython-37.pyc
Normal file
BIN
__pycache__/MultClassPipe2.cpython-37.pyc
Normal file
Binary file not shown.
BIN
__pycache__/MultClassPipe3.cpython-37.pyc
Normal file
BIN
__pycache__/MultClassPipe3.cpython-37.pyc
Normal file
Binary file not shown.
BIN
__pycache__/loopity_loop.cpython-37.pyc
Normal file
BIN
__pycache__/loopity_loop.cpython-37.pyc
Normal file
Binary file not shown.
29
comp_results
Normal file
29
comp_results
Normal file
|
@ -0,0 +1,29 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Thu Mar 10 10:59:36 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
# numerical
|
||||
#log_reg (rs)
|
||||
F1_score 0.713380
|
||||
MCC 0.376546
|
||||
Precision 0.687628
|
||||
Recall 0.747231
|
||||
Accuracy 0.687293
|
||||
ROC_curve 0.683199
|
||||
#log_reg (balanced)
|
||||
F1_score 0.715106
|
||||
MCC 0.390225
|
||||
Precision 0.702629
|
||||
Recall 0.733445
|
||||
Accuracy 0.694309
|
||||
ROC_curve 0.691555
|
||||
#log_reg (unbalanced)
|
||||
F1_score 0.713380
|
||||
MCC 0.376546
|
||||
Precision 0.687628
|
||||
Recall 0.747231
|
||||
Accuracy 0.687293
|
||||
ROC_curve 0.683199
|
13
imports.py
13
imports.py
|
@ -50,10 +50,10 @@ os.chdir(homedir + "/git/ML_AI_training/")
|
|||
# my function
|
||||
from MultClassPipe import MultClassPipeline
|
||||
from MultClassPipe2 import MultClassPipeline2
|
||||
from MultClassPipe3 import MultClassPipeSKF
|
||||
from loopity_loop import MultClassPipeSKF
|
||||
|
||||
gene = 'pncA'
|
||||
drug = 'pyrazinamide'
|
||||
gene = 'rpoB'
|
||||
drug = 'rifampicin'
|
||||
|
||||
#==============
|
||||
# directories
|
||||
|
@ -82,12 +82,19 @@ mycols = my_df.columns
|
|||
my_df['active_aa_pos'].dtype
|
||||
my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
|
||||
|
||||
if gene.lower() in geneL_na_ppi2:
|
||||
x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
|
||||
#D1148 get rid of
|
||||
na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
|
||||
my_df = my_df.drop(index=na_index)
|
||||
|
||||
#%%============================================================================
|
||||
# GET Y
|
||||
|
||||
# Target1: mutation_info_labels
|
||||
dm_om_map = {'DM': 1, 'OM': 0}
|
||||
target1 = my_df['mutation_info_labels'].map(dm_om_map)
|
||||
target1.value_counts()
|
||||
|
||||
# Target2: drug
|
||||
drug_labels = drug + '_labels'
|
||||
|
|
82
loopity_detangle.py
Normal file
82
loopity_detangle.py
Normal file
|
@ -0,0 +1,82 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Thu Mar 10 18:06:34 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
#%%
|
||||
models = [
|
||||
('Logistic Regression' , log_reg)
|
||||
, ('K-Nearest Neighbors', knn)
|
||||
]
|
||||
|
||||
classification_metrics = {
|
||||
'F1_score': []
|
||||
,'MCC': []
|
||||
,'Precision': []
|
||||
,'Recall': []
|
||||
,'Accuracy': []
|
||||
,'ROC_curve': []
|
||||
}
|
||||
|
||||
# Build a nested results dict shaped as:
#   fold_dict[model_name]["fold_<n>"][metric_name] -> value
# `models` and `classification_metrics` are defined earlier in this script;
# `random` and `pp` are expected to be in scope already
# (NOTE(review): this file has no imports of its own -- confirm they come
# from the interactive session / imports.py).
folds = [1, 2]
fold_no = 1
fold_dict = {}

# One empty sub-dict per model, filled fold by fold below.
for model_name, model in models:
    fold_dict.update({model_name: {}})

for f in folds:
    fold = "fold_" + str(fold_no)
    for model_name, model in models:
        print("start of model", model_name, "fold: ", fold)

        # Give every (model, fold) entry its OWN empty list per metric.
        # Doing `.update(classification_metrics)` here would copy references
        # to the same list objects into every entry, so an in-place append
        # for one fold would show up in all folds of all models.
        fold_dict[model_name][fold] = {
            metric: [] for metric in classification_metrics
        }

        print("end of model", model_name, "fold: ", fold)
        # Random placeholder value stored under 'F1_score' for this fold.
        fold_dict[model_name][fold].update({'F1_score': random.randrange(1,10)})
    # Advance the fold label once per fold, after all models were handled.
    fold_no += 1

pp.pprint(fold_dict)
|
||||
|
||||
|
||||
#%%
|
||||
# Mean F1 score per model, averaged over that model's folds.
# Reads `fold_dict` (model -> fold -> metric) and `models` built earlier in
# this script; `mean` and `pp` are expected to be in scope already
# (NOTE(review): confirm where `mean` is imported from).
# TODO: wrap this in a loop over the classification metrics.
mean_f1_by_model = {}

for model_name, model in models:
    print("Calculating mean for F1_score for: ", model_name)

    # Start from an empty list for every model; otherwise scores from the
    # previously processed models leak into this model's mean.
    folds_f1 = []
    for key in fold_dict[model_name]:
        # Index by the model being iterated, not a hard-coded model name.
        folds_f1.append(fold_dict[model_name][key]['F1_score'])
        print('key:', key, 'F1scores:', folds_f1)

    mean_f1_by_model[model_name] = mean(folds_f1)

pp.pprint(mean_f1_by_model)
|
||||
#%%
|
||||
scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
|
||||
|
||||
# manually
|
||||
model_name = 'Logistic Regression'
|
||||
model_metric = 'F1_score'
|
||||
|
||||
log_reg_f1 = []
|
||||
for key in fold_dict[model_name]:
|
||||
log_reg_f1.append(fold_dict[model_name][key][model_metric])
|
||||
log_reg_f1M = mean(log_reg_f1)
|
||||
print('key:', key, model_metric, ':', log_reg_f1)
|
||||
print(log_reg_f1M)
|
||||
|
||||
log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
|
||||
log_reg_f1df
|
||||
|
||||
#%%
|
||||
model_metric = 'MCC'
|
||||
log_reg_mcc = []
|
||||
for key in fold_dict[model_name]:
|
||||
log_reg_mcc.append(fold_dict[model_name][key][model_metric])
|
||||
log_reg_mccM = mean(log_reg_mcc)
|
||||
print('key:', key, model_metric, ':', log_reg_mcc)
|
||||
print(log_reg_mccM)
|
||||
|
||||
log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
|
||||
log_reg_mccdf
|
|
@ -97,13 +97,13 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
|
|||
,'Precision': []
|
||||
,'Recall': []
|
||||
,'Accuracy': []
|
||||
,'ROC_curve': []
|
||||
#,'ROC_AUC': []
|
||||
}
|
||||
models = [
|
||||
('Logistic Regression' , log_reg)
|
||||
#, ('Naive Bayes' , nb)
|
||||
, ('Naive Bayes' , nb)
|
||||
, ('K-Nearest Neighbors', knn)
|
||||
# , ('SVM' , svm)
|
||||
, ('SVM' , svm)
|
||||
# , ('MLP' , mlp)
|
||||
# , ('Decision Tree' , dt)
|
||||
# , ('Extra Trees' , et)
|
||||
|
@ -132,10 +132,8 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
|
|||
y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
|
||||
#print("Fold: ", fold_no, len(train_index), len(test_index))
|
||||
|
||||
# for keys in skf_dict:
|
||||
|
||||
for model_name, model in models:
|
||||
print("start of model", model_name, " loop", fold_no)
|
||||
print("\nStart of model", model_name, "\nLoop no.", fold_no)
|
||||
#skf_dict.update({model_name: classification_metrics })
|
||||
model_pipeline = Pipeline(steps=[('prep' , col_transform)
|
||||
, ('classifier' , model)])
|
||||
|
@ -145,28 +143,39 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
|
|||
#----------------
|
||||
# Model metrics
|
||||
#----------------
|
||||
score=f1_score(y_test_fold, y_pred_fold)
|
||||
mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
|
||||
fscore = f1_score(y_test_fold, y_pred_fold)
|
||||
mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
|
||||
#pres = precision_score(y_test_fold, y_pred_fold)
|
||||
#recall = recall_score(y_test_fold, y_pred_fold)
|
||||
pres = precision_score(y_test_fold, y_pred_fold, zero_division=0)
|
||||
recall = recall_score(y_test_fold, y_pred_fold, zero_division=0)
|
||||
accu = accuracy_score(y_test_fold, y_pred_fold)
|
||||
#roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
|
||||
|
||||
fold=("fold_"+str(fold_no))
|
||||
|
||||
fold_dict[model_name].update({fold: {}})
|
||||
pp.pprint(fold_dict)
|
||||
print("end of model", model_name, " loop", fold_no)
|
||||
#pp.pprint(fold_dict)
|
||||
print("\nEnd of model", model_name, "\nLoop no.", fold_no)
|
||||
|
||||
fold_dict[model_name][fold].update(classification_metrics)
|
||||
#fold_dict[model_name][fold]['F1_score'].append(score)
|
||||
fold_dict[model_name][fold].update({'F1_score': score})
|
||||
fold_dict[model_name][fold].update({'MCC': mcc})
|
||||
|
||||
fold_dict[model_name][fold].update({'F1_score' : fscore})
|
||||
fold_dict[model_name][fold].update({'MCC' : mcc})
|
||||
fold_dict[model_name][fold].update({'Precision' : pres})
|
||||
fold_dict[model_name][fold].update({'Recall' : recall})
|
||||
fold_dict[model_name][fold].update({'Accuracy' : accu})
|
||||
#fold_dict[model_name][fold].update({'ROC_AUC' : roc_auc})
|
||||
|
||||
fold_no +=1
|
||||
#pp.pprint(skf_dict)
|
||||
|
||||
return(fold_dict)
|
||||
|
||||
t3_res = MultClassPipeSKF(input_df = numerical_features_df
|
||||
, y_targetF = target1
|
||||
, var_type = 'numerical'
|
||||
, skf_splits = 10)
|
||||
#pp.pprint(t3_res)
|
||||
#print(t3_res)
|
||||
#%% Call function
|
||||
# t3_res = MultClassPipeSKF(input_df = numerical_features_df
|
||||
# , y_targetF = target1
|
||||
# , var_type = 'numerical'
|
||||
# , skf_splits = 10)
|
||||
# pp.pprint(t3_res)
|
||||
# #print(t3_res)
|
||||
|
|
90
loopity_loop_CALL
Normal file
90
loopity_loop_CALL
Normal file
|
@ -0,0 +1,90 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Mar 11 11:15:50 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%%
|
||||
del(t3_res)
|
||||
t3_res = MultClassPipeSKF(input_df = numerical_features_df
|
||||
, y_targetF = target1
|
||||
, var_type = 'numerical'
|
||||
, skf_splits = 10)
|
||||
pp.pprint(t3_res)
|
||||
#print(t3_res)
|
||||
|
||||
#%% Manually: mean for each model, each metric
|
||||
model_name = 'Logistic Regression'
|
||||
model_name = 'Naive Bayes'
|
||||
model_name = 'K-Nearest Neighbors'
|
||||
model_name = 'SVM'
|
||||
|
||||
#%%
|
||||
model_metric = 'F1_score'
|
||||
|
||||
log_reg_f1 = []
|
||||
for key in t3_res[model_name]:
|
||||
log_reg_f1.append(t3_res[model_name][key][model_metric])
|
||||
log_reg_f1M = mean(log_reg_f1)
|
||||
print('key:', key, model_metric, ':', log_reg_f1)
|
||||
print(log_reg_f1M)
|
||||
|
||||
log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
|
||||
log_reg_f1df
|
||||
|
||||
#%%
|
||||
model_metric = 'MCC'
|
||||
log_reg_mcc = []
|
||||
for key in t3_res[model_name]:
|
||||
log_reg_mcc.append(t3_res[model_name][key][model_metric])
|
||||
log_reg_mccM = mean(log_reg_mcc)
|
||||
print('key:', key, model_metric, ':', log_reg_mcc)
|
||||
print(log_reg_mccM)
|
||||
|
||||
log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
|
||||
log_reg_mccdf
|
||||
#%%
|
||||
model_metric = 'Precision'
|
||||
log_reg_pres = []
|
||||
for key in t3_res[model_name]:
|
||||
log_reg_pres.append(t3_res[model_name][key][model_metric])
|
||||
log_reg_presM = mean(log_reg_pres)
|
||||
print('key:', key, model_metric, ':', log_reg_pres)
|
||||
print(log_reg_presM)
|
||||
|
||||
log_reg_presdf = pd.DataFrame({model_name: [log_reg_presM]}, index = [model_metric])
|
||||
log_reg_presdf
|
||||
#%%
|
||||
model_metric = 'Recall'
|
||||
log_reg_recall = []
|
||||
for key in t3_res[model_name]:
|
||||
log_reg_recall.append(t3_res[model_name][key][model_metric])
|
||||
log_reg_recallM = mean(log_reg_recall)
|
||||
print('key:', key, model_metric, ':', log_reg_recall)
|
||||
print(log_reg_recallM)
|
||||
|
||||
log_reg_recalldf = pd.DataFrame({model_name: [log_reg_recallM]}, index = [model_metric])
|
||||
log_reg_recalldf
|
||||
#%%
|
||||
model_metric = 'Accuracy'
|
||||
log_reg_accu = []
|
||||
for key in t3_res[model_name]:
|
||||
log_reg_accu.append(t3_res[model_name][key][model_metric])
|
||||
log_reg_accuM = mean(log_reg_accu)
|
||||
print('key:', key, model_metric, ':', log_reg_accu)
|
||||
print(log_reg_accuM)
|
||||
|
||||
log_reg_accudf = pd.DataFrame({model_name: [log_reg_accuM]}, index = [model_metric])
|
||||
log_reg_accudf
|
||||
#%%
|
||||
model_metric = 'ROC_AUC'
|
||||
log_reg_roc_auc = []
|
||||
for key in t3_res[model_name]:
|
||||
log_reg_roc_auc.append(t3_res[model_name][key][model_metric])
|
||||
log_reg_roc_aucM = mean(log_reg_roc_auc)
|
||||
print('key:', key, model_metric, ':', log_reg_roc_auc)
|
||||
print(log_reg_roc_aucM)
|
||||
|
||||
log_reg_roc_aucdf = pd.DataFrame({model_name: [log_reg_roc_aucM]}, index = [model_metric])
|
||||
log_reg_roc_aucdf
|
|
@ -1,40 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Thu Mar 10 18:06:34 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
models = [
|
||||
('Logistic Regression' , log_reg)
|
||||
, ('K-Nearest Neighbors', knn)
|
||||
]
|
||||
|
||||
classification_metrics = {
|
||||
'F1_score': []
|
||||
,'MCC': []
|
||||
,'Precision': []
|
||||
,'Recall': []
|
||||
,'Accuracy': []
|
||||
,'ROC_curve': []
|
||||
}
|
||||
|
||||
folds=[1,2]
|
||||
fold_no=1
|
||||
fold_dict={}
|
||||
for model_name, model in models:
|
||||
fold_dict.update({model_name: {}})
|
||||
|
||||
for f in folds:
|
||||
fold=("fold_"+str(fold_no))
|
||||
for model_name, model in models:
|
||||
print("start of model", model_name, "fold: ", fold)
|
||||
fold_dict[model_name].update({fold: {}})
|
||||
fold_dict[model_name][fold].update(classification_metrics)
|
||||
|
||||
print("end of model", model_name, "fold: ", fold)
|
||||
fold_dict[model_name][fold].update({'F1_score': random.randrange(1,10)})
|
||||
fold_no +=1
|
||||
pp.pprint(fold_dict)
|
||||
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue