added loopity_loop to run multiple models with stratified k-fold, got stuck in infinite loops and nested dicts

This commit is contained in:
Tanushree Tunstall 2022-03-14 10:36:19 +00:00
parent 69d0c1b557
commit 7aead2d4f4
18 changed files with 287 additions and 62 deletions

SKF_SSF.txt Normal file (+48)

@@ -0,0 +1,48 @@
# Stratified K-fold vs Stratified ShuffleSplit
https://stackoverflow.com/questions/45969390/difference-between-stratifiedkfold-and-stratifiedshufflesplit-in-sklearn
In StratifiedShuffleSplit, the data is reshuffled before every split, so the test sets may overlap between splits.
In StratifiedKFold (SKF), the test sets never overlap.
So the difference is: StratifiedKFold shuffles at most once and then partitions the data, so its n_splits test sets are disjoint and together cover every sample exactly once, whereas StratifiedShuffleSplit reshuffles and resplits n_splits times, so its test sets can overlap and some samples may never be tested.
Note: both methods use "stratified" folds (hence "stratified" in both names): each part preserves the same percentage of samples of each class (label) as the original data. See the sklearn cross-validation documentation for details.
''' python code '''
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold

splits = 5
tx = range(10)           # 10 toy samples
ty = [0] * 5 + [1] * 5   # two balanced classes

kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
shufflesplit = StratifiedShuffleSplit(n_splits=splits, random_state=42, test_size=2)

print("KFold")
for train_index, test_index in kfold.split(tx, ty):
    print("TRAIN:", train_index, "TEST:", test_index)

print("Shuffle Split")
for train_index, test_index in shufflesplit.split(tx, ty):
    print("TRAIN:", train_index, "TEST:", test_index)
'''
Output:
KFold
TRAIN: [0 2 3 4 5 6 7 9] TEST: [1 8]
TRAIN: [0 1 2 3 5 7 8 9] TEST: [4 6]
TRAIN: [0 1 3 4 5 6 8 9] TEST: [2 7]
TRAIN: [1 2 3 4 6 7 8 9] TEST: [0 5]
TRAIN: [0 1 2 4 5 6 7 8] TEST: [3 9]
Shuffle Split
TRAIN: [8 4 1 0 6 5 7 2] TEST: [3 9]
TRAIN: [7 0 3 9 4 5 1 6] TEST: [8 2]
TRAIN: [1 2 5 6 4 8 9 0] TEST: [3 7]
TRAIN: [4 6 7 8 3 5 1 2] TEST: [9 0]
TRAIN: [7 2 6 5 4 3 0 9] TEST: [1 8]
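
The non-overlap claim is easy to verify: count how often each index lands in a test set. A quick sketch reusing the kfold and shufflesplit objects defined above:
''' python code '''
from collections import Counter

skf_counts = Counter(i for _, test in kfold.split(tx, ty) for i in test)
sss_counts = Counter(i for _, test in shufflesplit.split(tx, ty) for i in test)

print(skf_counts)  # every index appears exactly once: the folds partition the data
print(sss_counts)  # uneven counts: indices 3, 8, 9 repeat; 4, 5, 6 are never tested
'''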

(3 binary files changed, not shown)

comp_results Normal file (+29)

@@ -0,0 +1,29 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 10:59:36 2022
@author: tanu
"""
# numerical
#log_reg (rs)
F1_score 0.713380
MCC 0.376546
Precision 0.687628
Recall 0.747231
Accuracy 0.687293
ROC_curve 0.683199
#log_reg (balanced)
F1_score 0.715106
MCC 0.390225
Precision 0.702629
Recall 0.733445
Accuracy 0.694309
ROC_curve 0.691555
#log_reg (unbalanced)
F1_score 0.713380
MCC 0.376546
Precision 0.687628
Recall 0.747231
Accuracy 0.687293
ROC_curve 0.683199
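
# The (balanced) vs (unbalanced) rows above presumably differ only in the
# class_weight setting; a minimal sketch of how such a comparison is usually
# set up in sklearn (illustrative names, not the repo's actual pipeline):
from sklearn.linear_model import LogisticRegression

log_reg_unbal = LogisticRegression(random_state=42)  # default: all classes weighted equally
log_reg_bal = LogisticRegression(random_state=42, class_weight='balanced')  # weights ~ 1 / class frequency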

(modified file, name not shown)

@@ -50,10 +50,10 @@ os.chdir(homedir + "/git/ML_AI_training/")
# my function
from MultClassPipe import MultClassPipeline
from MultClassPipe2 import MultClassPipeline2
- from MultClassPipe3 import MultClassPipeSKF
+ from loopity_loop import MultClassPipeSKF
- gene = 'pncA'
- drug = 'pyrazinamide'
+ gene = 'rpoB'
+ drug = 'rifampicin'
#==============
# directories
@@ -82,12 +82,19 @@ mycols = my_df.columns
my_df['active_aa_pos'].dtype
my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
if gene.lower() in geneL_na_ppi2:
x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
#D1148 get rid of
na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
my_df = my_df.drop(index=na_index)
#%%============================================================================
# GET Y
# Target1: mutation_info_labels
dm_om_map = {'DM': 1, 'OM': 0}
target1 = my_df['mutation_info_labels'].map(dm_om_map)
target1.value_counts()
# Target2: drug
drug_labels = drug + '_labels'

loopity_detangle.py Normal file (+82)

@@ -0,0 +1,82 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 18:06:34 2022
@author: tanu
"""
#%%
import random
import pprint as pp
# log_reg and knn are the estimator objects defined in the calling script
models = [
    ('Logistic Regression' , log_reg)
    , ('K-Nearest Neighbors', knn)
]
classification_metrics = {
    'F1_score': []
    ,'MCC': []
    ,'Precision': []
    ,'Recall': []
    ,'Accuracy': []
    ,'ROC_curve': []
}
folds = [1, 2]
fold_no = 1
fold_dict = {}
for model_name, model in models:
    fold_dict.update({model_name: {}})

for f in folds:
    fold = "fold_" + str(fold_no)
    for model_name, model in models:
        print("start of model", model_name, "fold: ", fold)
        fold_dict[model_name].update({fold: {}})
        # NB: update() inserts references to the *shared* metric lists; use
        # copy.deepcopy(classification_metrics) to give each fold its own copy
        fold_dict[model_name][fold].update(classification_metrics)
        print("end of model", model_name, "fold: ", fold)
        fold_dict[model_name][fold].update({'F1_score': random.randrange(1, 10)})
    fold_no += 1
pp.pprint(fold_dict)
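#%% A flatter alternative (sketch): one record per (model, fold) avoids the
# nested dicts entirely and lets pandas do the per-model aggregation.
# Same toy setup as above (random F1 scores); not the real pipeline.
import pandas as pd

records = []
for fold_no in folds:
    for model_name, model in models:
        records.append({'Model': model_name
                        , 'Fold': fold_no
                        , 'F1_score': random.randrange(1, 10)})
scores_long = pd.DataFrame(records)
print(scores_long.groupby('Model')['F1_score'].mean())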
#%%
from statistics import mean

for model_name, model in models:
    print("Calculating mean F1_score for: ", model_name)
    folds_f1 = []   # reset per model so scores don't leak across models
    # TODO: wrap this in a loop over classification_metrics
    for key in fold_dict[model_name]:
        # index by model_name, not the hard-coded 'Logistic Regression'
        folds_f1.append(fold_dict[model_name][key]['F1_score'])
        print('key:', key, 'F1scores:', folds_f1)
    print('mean F1:', mean(folds_f1))
#%%
scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
# manually
model_name = 'Logistic Regression'
model_metric = 'F1_score'
log_reg_f1 = []
for key in fold_dict[model_name]:
    log_reg_f1.append(fold_dict[model_name][key][model_metric])
log_reg_f1M = mean(log_reg_f1)
print('key:', key, model_metric, ':', log_reg_f1)
print(log_reg_f1M)
log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
log_reg_f1df
#%%
model_metric = 'MCC'
log_reg_mcc = []
for key in fold_dict[model_name]:
    log_reg_mcc.append(fold_dict[model_name][key][model_metric])
log_reg_mccM = mean(log_reg_mcc)
print('key:', key, model_metric, ':', log_reg_mcc)
print(log_reg_mccM)
log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
log_reg_mccdf
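#%% The two metric cells above differ only in the metric name; they collapse
# into one loop over classification_metrics that fills the scores_df declared
# earlier. Sketch: assumes each metric holds one number per fold (in the toy
# run above that is only true of F1_score).
from statistics import mean
import pandas as pd

summary_rows = []
for model_name in fold_dict:
    row = {'Model': model_name}
    for metric in classification_metrics:
        row[metric] = mean(fold_dict[model_name][fold][metric]
                           for fold in fold_dict[model_name])
    summary_rows.append(row)
scores_df = pd.DataFrame(summary_rows)
print(scores_df)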

(modified file, name not shown)

@@ -97,13 +97,13 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
,'Precision': []
,'Recall': []
,'Accuracy': []
,'ROC_curve': []
#,'ROC_AUC': []
}
models = [
('Logistic Regression' , log_reg)
- #, ('Naive Bayes' , nb)
+ , ('Naive Bayes' , nb)
, ('K-Nearest Neighbors', knn)
- # , ('SVM' , svm)
+ , ('SVM' , svm)
# , ('MLP' , mlp)
# , ('Decision Tree' , dt)
# , ('Extra Trees' , et)
@@ -132,10 +132,8 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
#print("Fold: ", fold_no, len(train_index), len(test_index))
# for keys in skf_dict:
for model_name, model in models:
print("start of model", model_name, " loop", fold_no)
print("\nStart of model", model_name, "\nLoop no.", fold_no)
#skf_dict.update({model_name: classification_metrics })
model_pipeline = Pipeline(steps=[('prep' , col_transform)
, ('classifier' , model)])
@@ -145,28 +143,39 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
#----------------
# Model metrics
#----------------
- score=f1_score(y_test_fold, y_pred_fold)
- mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
+ fscore = f1_score(y_test_fold, y_pred_fold)
+ mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
- #pres = precision_score(y_test_fold, y_pred_fold)
- #recall = recall_score(y_test_fold, y_pred_fold)
+ pres = precision_score(y_test_fold, y_pred_fold, zero_division=0)
+ recall = recall_score(y_test_fold, y_pred_fold, zero_division=0)
accu = accuracy_score(y_test_fold, y_pred_fold)
#roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
fold=("fold_"+str(fold_no))
fold_dict[model_name].update({fold: {}})
- pp.pprint(fold_dict)
- print("end of model", model_name, " loop", fold_no)
+ #pp.pprint(fold_dict)
+ print("\nEnd of model", model_name, "\nLoop no.", fold_no)
fold_dict[model_name][fold].update(classification_metrics)
- #fold_dict[model_name][fold]['F1_score'].append(score)
- fold_dict[model_name][fold].update({'F1_score': score})
- fold_dict[model_name][fold].update({'MCC': mcc})
+ fold_dict[model_name][fold].update({'F1_score' : fscore})
+ fold_dict[model_name][fold].update({'MCC' : mcc})
+ fold_dict[model_name][fold].update({'Precision' : pres})
+ fold_dict[model_name][fold].update({'Recall' : recall})
+ fold_dict[model_name][fold].update({'Accuracy' : accu})
+ #fold_dict[model_name][fold].update({'ROC_AUC' : roc_auc})
fold_no +=1
#pp.pprint(skf_dict)
return(fold_dict)
- t3_res = MultClassPipeSKF(input_df = numerical_features_df
-                           , y_targetF = target1
-                           , var_type = 'numerical'
-                           , skf_splits = 10)
- #pp.pprint(t3_res)
- #print(t3_res)
+ #%% Call function
+ # t3_res = MultClassPipeSKF(input_df = numerical_features_df
+ #                           , y_targetF = target1
+ #                           , var_type = 'numerical'
+ #                           , skf_splits = 10)
+ # pp.pprint(t3_res)
+ # #print(t3_res)
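#%% Note on zero_division=0 (added above): when a fold's predictions contain
# no positives, precision is undefined; sklearn's default ('warn') returns 0.0
# but emits UndefinedMetricWarning on every such fold. zero_division=0 keeps
# the 0.0 score and silences the warning. A minimal illustration:
from sklearn.metrics import precision_score

y_true = [1, 0, 1, 0]
y_pred = [0, 0, 0, 0]  # classifier never predicts the positive class
print(precision_score(y_true, y_pred, zero_division=0))  # 0.0, no warning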

loopity_loop_CALL Normal file (+90)

@@ -0,0 +1,90 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 11 11:15:50 2022
@author: tanu
"""
#%%
del(t3_res)
t3_res = MultClassPipeSKF(input_df = numerical_features_df
, y_targetF = target1
, var_type = 'numerical'
, skf_splits = 10)
pp.pprint(t3_res)
#print(t3_res)
#%% Manually: mean for each model, each metric
# pick ONE model at a time (each assignment below overwrites the previous),
# then re-run the metric cells that follow
model_name = 'Logistic Regression'
#model_name = 'Naive Bayes'
#model_name = 'K-Nearest Neighbors'
#model_name = 'SVM'
#%%
model_metric = 'F1_score'
log_reg_f1 = []
for key in t3_res[model_name]:
    log_reg_f1.append(t3_res[model_name][key][model_metric])
log_reg_f1M = mean(log_reg_f1)
print('key:', key, model_metric, ':', log_reg_f1)
print(log_reg_f1M)
log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
log_reg_f1df
#%%
model_metric = 'MCC'
log_reg_mcc = []
for key in t3_res[model_name]:
    log_reg_mcc.append(t3_res[model_name][key][model_metric])
log_reg_mccM = mean(log_reg_mcc)
print('key:', key, model_metric, ':', log_reg_mcc)
print(log_reg_mccM)
log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
log_reg_mccdf
#%%
model_metric = 'Precision'
log_reg_pres = []
for key in t3_res[model_name]:
    log_reg_pres.append(t3_res[model_name][key][model_metric])
log_reg_presM = mean(log_reg_pres)
print('key:', key, model_metric, ':', log_reg_pres)
print(log_reg_presM)
log_reg_presdf = pd.DataFrame({model_name: [log_reg_presM]}, index = [model_metric])
log_reg_presdf
#%%
model_metric = 'Recall'
log_reg_recall = []
for key in t3_res[model_name]:
    log_reg_recall.append(t3_res[model_name][key][model_metric])
log_reg_recallM = mean(log_reg_recall)
print('key:', key, model_metric, ':', log_reg_recall)
print(log_reg_recallM)
log_reg_recalldf = pd.DataFrame({model_name: [log_reg_recallM]}, index = [model_metric])
log_reg_recalldf
#%%
model_metric = 'Accuracy'
log_reg_accu = []
for key in t3_res[model_name]:
    log_reg_accu.append(t3_res[model_name][key][model_metric])
log_reg_accuM = mean(log_reg_accu)
print('key:', key, model_metric, ':', log_reg_accu)
print(log_reg_accuM)
log_reg_accudf = pd.DataFrame({model_name: [log_reg_accuM]}, index = [model_metric])
log_reg_accudf
#%%
model_metric = 'ROC_AUC'  # NB: ROC_AUC is still commented out in MultClassPipeSKF, so this cell will KeyError until it is re-enabled
log_reg_roc_auc = []
for key in t3_res[model_name]:
    log_reg_roc_auc.append(t3_res[model_name][key][model_metric])
log_reg_roc_aucM = mean(log_reg_roc_auc)
print('key:', key, model_metric, ':', log_reg_roc_auc)
print(log_reg_roc_aucM)
log_reg_roc_aucdf = pd.DataFrame({model_name: [log_reg_roc_aucM]}, index = [model_metric])
log_reg_roc_aucdf
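#%% All six cells above repeat one pattern; a compact sketch, assuming t3_res
# keeps the {model: {fold: {metric: value}}} shape MultClassPipeSKF returns:
from statistics import mean
import pandas as pd

metrics = ['F1_score', 'MCC', 'Precision', 'Recall', 'Accuracy']  # add 'ROC_AUC' once it is recorded
summary = {model_name: {m: mean(t3_res[model_name][f][m]
                                for f in t3_res[model_name])
                        for m in metrics}
           for model_name in t3_res}
scores_df = pd.DataFrame(summary).T  # rows = models, columns = metrics
print(scores_df)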

(deleted file, name not shown)

@@ -1,40 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 18:06:34 2022
@author: tanu
"""
models = [
    ('Logistic Regression' , log_reg)
    , ('K-Nearest Neighbors', knn)
]
classification_metrics = {
    'F1_score': []
    ,'MCC': []
    ,'Precision': []
    ,'Recall': []
    ,'Accuracy': []
    ,'ROC_curve': []
}
folds = [1, 2]
fold_no = 1
fold_dict = {}
for model_name, model in models:
    fold_dict.update({model_name: {}})

for f in folds:
    fold = "fold_" + str(fold_no)
    for model_name, model in models:
        print("start of model", model_name, "fold: ", fold)
        fold_dict[model_name].update({fold: {}})
        fold_dict[model_name][fold].update(classification_metrics)
        print("end of model", model_name, "fold: ", fold)
        fold_dict[model_name][fold].update({'F1_score': random.randrange(1,10)})
    fold_no += 1
pp.pprint(fold_dict)