diff --git a/SKF_SSF.txt b/SKF_SSF.txt
new file mode 100644
index 0000000..77f45e1
--- /dev/null
+++ b/SKF_SSF.txt
@@ -0,0 +1,48 @@
+# Stratified K-fold vs ShuffleSplit
+
+https://stackoverflow.com/questions/45969390/difference-between-stratifiedkfold-and-stratifiedshufflesplit-in-sklearn
+
+In ShuffleSplit, the data is shuffled every time and then split, so the test sets may overlap between splits.
+In SKF, the test sets don't overlap.
+
+So the difference is that StratifiedKFold shuffles and splits just once, so the test sets do not overlap, whereas StratifiedShuffleSplit reshuffles before each of its n_splits splits, so the test sets can overlap.
+
+Note: both methods use stratified folds (that is why "stratified" appears in both names), meaning each part preserves the same percentage of samples of each class (label) as the original data. You can read more in the scikit-learn cross-validation documentation.
+
+
+''' python code '''
+splits = 5
+
+tx = range(10)
+ty = [0] * 5 + [1] * 5
+
+from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
+from sklearn import datasets
+
+kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
+shufflesplit = StratifiedShuffleSplit(n_splits=splits, random_state=42, test_size=2)
+
+print("KFold")
+for train_index, test_index in kfold.split(tx, ty):
+    print("TRAIN:", train_index, "TEST:", test_index)
+
+print("Shuffle Split")
+for train_index, test_index in shufflesplit.split(tx, ty):
+    print("TRAIN:", train_index, "TEST:", test_index)
+
+'''
+Output:
+
+KFold
+TRAIN: [0 2 3 4 5 6 7 9] TEST: [1 8]
+TRAIN: [0 1 2 3 5 7 8 9] TEST: [4 6]
+TRAIN: [0 1 3 4 5 6 8 9] TEST: [2 7]
+TRAIN: [1 2 3 4 6 7 8 9] TEST: [0 5]
+TRAIN: [0 1 2 4 5 6 7 8] TEST: [3 9]
+
+Shuffle Split
+TRAIN: [8 4 1 0 6 5 7 2] TEST: [3 9]
+TRAIN: [7 0 3 9 4 5 1 6] TEST: [8 2]
+TRAIN: [1 2 5 6 4 8 9 0] TEST: [3 7]
+TRAIN: [4 6 7 8 3 5 1 2] TEST: [9 0]
+TRAIN: [7 2 6 5 4 3 0 9] TEST: [1 8]
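To make the stratification point above concrete, here is a minimal sketch (assuming numpy is available and using the same toy tx/ty labels as in SKF_SSF.txt); it prints the class counts of each test fold, and both splitters keep the original 50:50 class ratio.

''' python code '''
import numpy as np
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

tx = np.arange(10).reshape(-1, 1)          # 10 samples, 1 dummy feature
ty = np.array([0] * 5 + [1] * 5)           # balanced binary labels

splitters = [('StratifiedKFold'       , StratifiedKFold(n_splits=5, shuffle=True, random_state=42)),
             ('StratifiedShuffleSplit', StratifiedShuffleSplit(n_splits=5, test_size=2, random_state=42))]

for name, splitter in splitters:
    print(name)
    for train_index, test_index in splitter.split(tx, ty):
        # each test fold keeps one sample of each class (50:50, as in ty)
        print("TEST:", test_index, "class counts:", np.bincount(ty[test_index]))
'''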
diff --git a/__pycache__/MultClassPipe2.cpython-37.pyc b/__pycache__/MultClassPipe2.cpython-37.pyc
new file mode 100644
index 0000000..1cb8e8b
Binary files /dev/null and b/__pycache__/MultClassPipe2.cpython-37.pyc differ
diff --git a/__pycache__/MultClassPipe3.cpython-37.pyc b/__pycache__/MultClassPipe3.cpython-37.pyc
new file mode 100644
index 0000000..2777a29
Binary files /dev/null and b/__pycache__/MultClassPipe3.cpython-37.pyc differ
diff --git a/__pycache__/loopity_loop.cpython-37.pyc b/__pycache__/loopity_loop.cpython-37.pyc
new file mode 100644
index 0000000..effcb8e
Binary files /dev/null and b/__pycache__/loopity_loop.cpython-37.pyc differ
diff --git a/comp_results b/comp_results
new file mode 100644
index 0000000..9fd5e70
--- /dev/null
+++ b/comp_results
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Mar 10 10:59:36 2022
+
+@author: tanu
+"""
+# numerical
+#log_reg (rs)
+F1_score     0.713380
+MCC          0.376546
+Precision    0.687628
+Recall       0.747231
+Accuracy     0.687293
+ROC_curve    0.683199
+#log_reg (balanced)
+F1_score     0.715106
+MCC          0.390225
+Precision    0.702629
+Recall       0.733445
+Accuracy     0.694309
+ROC_curve    0.691555
+#log_reg (unbalanced)
+F1_score     0.713380
+MCC          0.376546
+Precision    0.687628
+Recall       0.747231
+Accuracy     0.687293
+ROC_curve    0.683199
\ No newline at end of file
diff --git a/imports.py b/imports.py
index 2eaf070..ab3606c 100644
--- a/imports.py
+++ b/imports.py
@@ -50,10 +50,10 @@ os.chdir(homedir + "/git/ML_AI_training/")
 # my function
 from MultClassPipe import MultClassPipeline
 from MultClassPipe2 import MultClassPipeline2
-from MultClassPipe3 import MultClassPipeSKF
+from loopity_loop import MultClassPipeSKF
 
-gene = 'pncA'
-drug = 'pyrazinamide'
+gene = 'rpoB'
+drug = 'rifampicin'
 
 #==============
 # directories
@@ -82,12 +82,19 @@ mycols = my_df.columns
 my_df['active_aa_pos'].dtype
 my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
 
+if gene.lower() in geneL_na_ppi2:
+    x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
+    #D1148 get rid of
+    na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
+    my_df = my_df.drop(index=na_index)
+
 #%%============================================================================
 # GET Y
 
 # Target1: mutation_info_labels
 dm_om_map = {'DM': 1, 'OM': 0}
 target1 = my_df['mutation_info_labels'].map(dm_om_map)
+target1.value_counts()
 
 # Target2: drug
 drug_labels = drug + '_labels'
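A side note on the new NaN-handling block in imports.py: the same row drop can be expressed with pandas' own missing-value helpers. A minimal sketch (using a made-up stand-in for my_df, since only the mcsm_na_affinity column matters for this step):

''' python code '''
import numpy as np
import pandas as pd

# toy stand-in for my_df; only the mcsm_na_affinity column matters here
my_df = pd.DataFrame({'mutationinformation': ['A1B', 'C2D', 'E3F'],
                      'mcsm_na_affinity'   : [0.5, np.nan, -1.2]})

# equivalent to building na_index with apply(np.isnan) and then drop(index=na_index)
my_df = my_df.dropna(subset=['mcsm_na_affinity'])
print(my_df)
'''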
diff --git a/loopity_detangle.py b/loopity_detangle.py
new file mode 100644
index 0000000..56f6999
--- /dev/null
+++ b/loopity_detangle.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Mar 10 18:06:34 2022
+
+@author: tanu
+"""
+
+#%%
+models = [
+    ('Logistic Regression' , log_reg)
+    , ('K-Nearest Neighbors', knn)
+    ]
+
+classification_metrics = {
+    'F1_score': []
+    ,'MCC': []
+    ,'Precision': []
+    ,'Recall': []
+    ,'Accuracy': []
+    ,'ROC_curve': []
+    }
+
+folds=[1,2]
+fold_no=1
+fold_dict={}
+for model_name, model in models:
+    fold_dict.update({model_name: {}})
+
+for f in folds:
+    fold=("fold_"+str(fold_no))
+    for model_name, model in models:
+        print("start of model", model_name, "fold: ", fold)
+        fold_dict[model_name].update({fold: {}})
+        fold_dict[model_name][fold].update(classification_metrics)
+
+        print("end of model", model_name, "fold: ", fold)
+        fold_dict[model_name][fold].update({'F1_score': random.randrange(1,10)})
+    fold_no +=1
+    pp.pprint(fold_dict)
+
+
+#%%
+folds_f1=[]
+
+for model_name, model in models:
+    print("Calculating mean for F1_score for: ", model_name)
+    #for key in fold_dict['Logistic Regression']:
+    # wrap this in a classification_metric for loop
+    for key in fold_dict[model_name]:
+        folds_f1.append(fold_dict[model_name][key]['F1_score'])
+        #folds_f1.append(folds_f1)
+        print('key:', key, 'F1scores:', folds_f1)
+mean(folds_f1)
+#%%
+scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
+
+# manually
+model_name = 'Logistic Regression'
+model_metric = 'F1_score'
+
+log_reg_f1 = []
+for key in fold_dict[model_name]:
+    log_reg_f1.append(fold_dict[model_name][key][model_metric])
+    log_reg_f1M = mean(log_reg_f1)
+    print('key:', key, model_metric, ':', log_reg_f1)
+print(log_reg_f1M)
+
+log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
+log_reg_f1df
+
+#%%
+model_metric = 'MCC'
+log_reg_mcc = []
+for key in fold_dict[model_name]:
+    log_reg_mcc.append(fold_dict[model_name][key][model_metric])
+    log_reg_mccM = mean(log_reg_mcc)
+    print('key:', key, model_metric, ':', log_reg_mcc)
+print(log_reg_mccM)
+
+log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
+log_reg_mccdf
\ No newline at end of file
diff --git a/loopity_loop.py b/loopity_loop.py
index 936cc6d..17fd851 100644
--- a/loopity_loop.py
+++ b/loopity_loop.py
@@ -97,13 +97,13 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
         ,'Precision': []
         ,'Recall': []
         ,'Accuracy': []
-        ,'ROC_curve': []
+        #,'ROC_AUC': []
         }
     models = [
         ('Logistic Regression' , log_reg)
-        #, ('Naive Bayes' , nb)
+        , ('Naive Bayes' , nb)
         , ('K-Nearest Neighbors', knn)
-        # , ('SVM' , svm)
+        , ('SVM' , svm)
         # , ('MLP' , mlp)
         # , ('Decision Tree' , dt)
         # , ('Extra Trees' , et)
@@ -132,10 +132,8 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
         y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
         #print("Fold: ", fold_no, len(train_index), len(test_index))
 
-        # for keys in skf_dict:
-
         for model_name, model in models:
-            print("start of model", model_name, " loop", fold_no)
+            print("\nStart of model", model_name, "\nLoop no.", fold_no)
             #skf_dict.update({model_name: classification_metrics })
             model_pipeline = Pipeline(steps=[('prep' , col_transform)
                                              , ('classifier' , model)])
@@ -145,28 +143,39 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
             #----------------
             # Model metrics
             #----------------
-            score=f1_score(y_test_fold, y_pred_fold)
-            mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
+            fscore = f1_score(y_test_fold, y_pred_fold)
+            mcc    = matthews_corrcoef(y_test_fold, y_pred_fold)
+            #pres   = precision_score(y_test_fold, y_pred_fold)
+            #recall = recall_score(y_test_fold, y_pred_fold)
+            pres   = precision_score(y_test_fold, y_pred_fold, zero_division=0)
+            recall = recall_score(y_test_fold, y_pred_fold, zero_division=0)
+            accu   = accuracy_score(y_test_fold, y_pred_fold)
+            #roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
 
             fold=("fold_"+str(fold_no))
             fold_dict[model_name].update({fold: {}})
-            pp.pprint(fold_dict)
-            print("end of model", model_name, " loop", fold_no)
+            #pp.pprint(fold_dict)
+            print("\nEnd of model", model_name, "\nLoop no.", fold_no)
 
             fold_dict[model_name][fold].update(classification_metrics)
             #fold_dict[model_name][fold]['F1_score'].append(score)
-            fold_dict[model_name][fold].update({'F1_score': score})
-            fold_dict[model_name][fold].update({'MCC': mcc})
-
+            fold_dict[model_name][fold].update({'F1_score'  : fscore})
+            fold_dict[model_name][fold].update({'MCC'       : mcc})
+            fold_dict[model_name][fold].update({'Precision' : pres})
+            fold_dict[model_name][fold].update({'Recall'    : recall})
+            fold_dict[model_name][fold].update({'Accuracy'  : accu})
+            #fold_dict[model_name][fold].update({'ROC_AUC'   : roc_auc})
+
         fold_no +=1
         #pp.pprint(skf_dict)
     return(fold_dict)
 
-t3_res = MultClassPipeSKF(input_df = numerical_features_df
-                          , y_targetF = target1
-                          , var_type = 'numerical'
-                          , skf_splits = 10)
-#pp.pprint(t3_res)
-#print(t3_res)
+#%% Call function
+# t3_res = MultClassPipeSKF(input_df = numerical_features_df
+#                           , y_targetF = target1
+#                           , var_type = 'numerical'
+#                           , skf_splits = 10)
+# pp.pprint(t3_res)
+# #print(t3_res)
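Since the ROC_AUC lines are commented out above: if they are re-enabled, roc_auc_score is normally given the positive-class probability rather than the hard 0/1 predictions. A minimal, self-contained sketch of that pattern (toy data and a toy pipeline, not the project's own col_transform):

''' python code '''
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

model_pipeline = Pipeline(steps=[('prep'       , StandardScaler())
                                 , ('classifier', LogisticRegression())])
model_pipeline.fit(X_train, y_train)

# probability of the positive class, not the 0/1 predictions
y_prob = model_pipeline.predict_proba(X_test)[:, 1]
print('ROC_AUC:', roc_auc_score(y_test, y_prob))
'''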
diff --git a/loopity_loop_CALL b/loopity_loop_CALL
new file mode 100644
index 0000000..4916d2b
--- /dev/null
+++ b/loopity_loop_CALL
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Mar 11 11:15:50 2022
+
+@author: tanu
+"""
+#%%
+del(t3_res)
+t3_res = MultClassPipeSKF(input_df = numerical_features_df
+                          , y_targetF = target1
+                          , var_type = 'numerical'
+                          , skf_splits = 10)
+pp.pprint(t3_res)
+#print(t3_res)
+
+#%% Manually: mean for each model, each metric
+model_name = 'Logistic Regression'
+model_name = 'Naive Bayes'
+model_name = 'K-Nearest Neighbors'
+model_name = 'SVM'
+
+#%%
+model_metric = 'F1_score'
+
+log_reg_f1 = []
+for key in t3_res[model_name]:
+    log_reg_f1.append(t3_res[model_name][key][model_metric])
+    log_reg_f1M = mean(log_reg_f1)
+    print('key:', key, model_metric, ':', log_reg_f1)
+print(log_reg_f1M)
+
+log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
+log_reg_f1df
+
+#%%
+model_metric = 'MCC'
+log_reg_mcc = []
+for key in t3_res[model_name]:
+    log_reg_mcc.append(t3_res[model_name][key][model_metric])
+    log_reg_mccM = mean(log_reg_mcc)
+    print('key:', key, model_metric, ':', log_reg_mcc)
+print(log_reg_mccM)
+
+log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
+log_reg_mccdf
+#%%
+model_metric = 'Precision'
+log_reg_pres = []
+for key in t3_res[model_name]:
+    log_reg_pres.append(t3_res[model_name][key][model_metric])
+    log_reg_presM = mean(log_reg_pres)
+    print('key:', key, model_metric, ':', log_reg_pres)
+print(log_reg_presM)
+
+log_reg_presdf = pd.DataFrame({model_name: [log_reg_presM]}, index = [model_metric])
+log_reg_presdf
+#%%
+model_metric = 'Recall'
+log_reg_recall = []
+for key in t3_res[model_name]:
+    log_reg_recall.append(t3_res[model_name][key][model_metric])
+    log_reg_recallM = mean(log_reg_recall)
+    print('key:', key, model_metric, ':', log_reg_recall)
+print(log_reg_recallM)
+
+log_reg_recalldf = pd.DataFrame({model_name: [log_reg_recallM]}, index = [model_metric])
+log_reg_recalldf
+#%%
+model_metric = 'Accuracy'
+log_reg_accu = []
+for key in t3_res[model_name]:
+    log_reg_accu.append(t3_res[model_name][key][model_metric])
+    log_reg_accuM = mean(log_reg_accu)
+    print('key:', key, model_metric, ':', log_reg_accu)
+print(log_reg_accuM)
+
+log_reg_accudf = pd.DataFrame({model_name: [log_reg_accuM]}, index = [model_metric])
+log_reg_accudf
+#%%
+model_metric = 'ROC_AUC'
+log_reg_roc_auc = []
+for key in t3_res[model_name]:
+    log_reg_roc_auc.append(t3_res[model_name][key][model_metric])
+    log_reg_roc_aucM = mean(log_reg_roc_auc)
+    print('key:', key, model_metric, ':', log_reg_roc_auc)
+print(log_reg_roc_aucM)
+
+log_reg_roc_aucdf = pd.DataFrame({model_name: [log_reg_roc_aucM]}, index = [model_metric])
+log_reg_roc_aucdf
\ No newline at end of file
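The six per-metric cells in loopity_loop_CALL repeat the same append/mean pattern by hand. As a sketch of a more compact alternative (assuming t3_res keeps the {model: {fold: {metric: value}}} shape returned by MultClassPipeSKF; the numbers below are dummies):

''' python code '''
import pandas as pd

# dummy t3_res with the assumed {model: {fold: {metric: value}}} shape
t3_res = {'Logistic Regression': {'fold_1': {'F1_score': 0.70, 'MCC': 0.40},
                                  'fold_2': {'F1_score': 0.60, 'MCC': 0.30}},
          'K-Nearest Neighbors': {'fold_1': {'F1_score': 0.50, 'MCC': 0.20},
                                  'fold_2': {'F1_score': 0.40, 'MCC': 0.10}}}

# one row per (model, fold), one column per metric
scores_df = pd.concat({model: pd.DataFrame(folds).T for model, folds in t3_res.items()},
                      names=['Model', 'Fold'])

# mean of every metric across folds, for every model at once
mean_scores = scores_df.groupby(level='Model').mean()
print(mean_scores)
'''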
- print("start of model", model_name, "fold: ", fold) - fold_dict[model_name].update({fold: {}}) - fold_dict[model_name][fold].update(classification_metrics) - - print("end of model", model_name, "fold: ", fold) - fold_dict[model_name][fold].update({'F1_score': random.randrange(1,10)}) - fold_no +=1 - pp.pprint(fold_dict) - - \ No newline at end of file