From bff16fc219d889d2ecb465a88d638f8da33a2c80 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Thu, 3 Mar 2022 17:59:51 +0000
Subject: [PATCH] added my_data5.py to run multiple classifications algorithms and added prelim results

---
 my_data5.py           | 156 ++++++++++++++++++++++++++++++++++++++++++
 my_data5_results_pnca |  84 +++++++++++++++++++++++
 2 files changed, 240 insertions(+)
 create mode 100644 my_data5.py
 create mode 100644 my_data5_results_pnca

diff --git a/my_data5.py b/my_data5.py
new file mode 100644
index 0000000..500e6ba
--- /dev/null
+++ b/my_data5.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Mar 3 17:08:18 2022
+
+@author: tanu
+"""
+from sklearn.linear_model import LogisticRegression
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
+from sklearn.neural_network import MLPClassifier
+from sklearn.pipeline import Pipeline
+from xgboost import XGBClassifier
+from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import train_test_split
+import os
+from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
+import pandas as pd
+#%%
+homedir = os.path.expanduser("~")
+os.chdir(homedir + "/git/ML_AI_training/test_data")
+
+# this needs to be merged_df2 or merged_df3?
+#gene 'pncA'
+drug = 'pyrazinamide'
+
+my_df = pd.read_csv("pnca_merged_df3.csv")
+
+my_df.dtypes
+my_df_cols = my_df.columns
+
+#%%============================================================================
+# GET Y
+# Y = my_df.loc[:,drug] #has NA
+dm_om_map = {'DM': 1, 'OM': 0}
+my_df['resistance'] = my_df['mutation_info_labels'].map(dm_om_map)
+
+# sanity check
+my_df['resistance'].value_counts()
+my_df['mutation_info_labels'].value_counts()
+Y = my_df['resistance']
+
+# GET X
+cols = my_df.columns
+X_stability = my_df[['ligand_distance'
+                     , 'ligand_affinity_change'
+                     , 'duet_stability_change'
+                     , 'ddg_foldx'
+                     , 'deepddg'
+                     , 'ddg_dynamut2']]
+
+X_evol = my_df[['consurf_score'
+                , 'snap2_score'
+                , 'snap2_accuracy_pc']]
+
+X_str = my_df[['asa'
+               , 'rsa'
+               , 'kd_values'
+               , 'rd_values']]
+
+#%% try combinations
+# NOTE: pick ONE of these per run; executed top to bottom, each assignment
+# overwrites the previous one and only the last (evolution + str) is used
+X_vars = X_stability
+X_vars = X_evol
+X_vars = X_str
+
+X_vars = pd.concat([X_stability, X_evol, X_str], axis = 1)
+X_vars = pd.concat([X_stability, X_evol], axis = 1)
+X_vars = pd.concat([X_stability, X_str], axis = 1)
+X_vars = pd.concat([X_evol, X_str], axis = 1)
+
+#%%
+X_vars.shape[1]
+
+# TODO: stratified cross validate
+# Train-test Split
+rs = {'random_state': 42}
+X_train, X_test, y_train, y_test = train_test_split(X_vars,
+                                                    Y,
+                                                    test_size = 0.33,
+                                                    random_state = 42)
+
+# Classification - Model Pipeline
+def modelPipeline(X_train, X_test, y_train, y_test):
+    """Fit nine scaled classifiers and score each on the test split.
+
+    Each classifier is wrapped in a Pipeline that applies StandardScaler
+    first; the pipeline is fitted on (X_train, y_train) and its predictions
+    for X_test are compared against y_test.
+
+    Parameters
+    ----------
+    X_train, X_test
+        Feature tables for the training and test splits.
+    y_train, y_test
+        Class labels for the same two splits.
+
+    Returns
+    -------
+    pipelines : list
+        The fitted Pipeline objects, in the order the models are listed.
+    scores_df : pandas.DataFrame
+        One row per model with the columns Model, F1_Score, Precision,
+        Recall, Accuracy and ROC_AUC.
+    """
+    # `rs` is the module-level {'random_state': 42} dict defined above
+    clfs = [
+        ('Logistic Regression', LogisticRegression(**rs)),
+        ('Naive Bayes', BernoulliNB()),
+        ('K-Nearest Neighbors', KNeighborsClassifier()),
+        ('SVM', SVC(**rs)),
+        ('MLP', MLPClassifier(max_iter=500, **rs)),
+        ('Decision Tree', DecisionTreeClassifier(**rs)),
+        ('Extra Trees', ExtraTreesClassifier(**rs)),
+        ('Random Forest', RandomForestClassifier(**rs)),
+        ('XGBoost', XGBClassifier(**rs, verbosity=0))
+    ]
+
+    pipelines = []
+    score_rows = []
+
+    for clf_name, clf in clfs:
+        pipeline = Pipeline(steps=[
+            ('scaler', StandardScaler()),
+            ('classifier', clf)
+        ])
+        pipeline.fit(X_train, y_train)
+        pipelines.append(pipeline)
+
+        # Model predictions
+        y_pred = pipeline.predict(X_test)
+
+        score_rows.append({
+            'Model'     : clf_name,
+            'F1_Score'  : f1_score(y_test, y_pred),
+            'Precision' : precision_score(y_test, y_pred),
+            'Recall'    : recall_score(y_test, y_pred),
+            'Accuracy'  : accuracy_score(y_test, y_pred),
+            # scored on the predicted class labels, not on probabilities
+            'ROC_AUC'   : roc_auc_score(y_test, y_pred)
+        })
+
+    # DataFrame.append was removed in pandas 2.0: collect one dict per model
+    # and build the table in a single call instead
+    scores_df = pd.DataFrame(score_rows,
+                             columns=['Model', 'F1_Score', 'Precision',
+                                      'Recall', 'Accuracy', 'ROC_AUC'])
+
+    return pipelines, scores_df
+
+
+modelPipeline(X_train, X_test, y_train, y_test)
\ No newline at end of file
diff --git a/my_data5_results_pnca b/my_data5_results_pnca
new file mode 100644
index 0000000..005c7e3
--- /dev/null
+++ b/my_data5_results_pnca
@@ -0,0 +1,84 @@
+ # stability [6]
+                  Model  F1_Score  Precision    Recall  Accuracy   ROC_AUC
+ 0  Logistic Regression  0.738854   0.698795  0.783784  0.707143  0.702498
+ 1          Naive Bayes  0.627451   0.607595  0.648649  0.592857  0.589476
+ 2  K-Nearest Neighbors  0.731707   0.666667  0.810811  0.685714  0.678133
+ 3                  SVM  0.729412   0.645833  0.837838  0.671429  0.661343
+ 4                  MLP  0.670968   0.641975  0.702703  0.635714  0.631654
+ 5        Decision Tree  0.653595   0.632911  0.675676  0.621429  0.618141
+ 6          Extra Trees  0.733728   0.652632  0.837838  0.678571  0.668919
+ 7        Random Forest  0.726190   0.648936  0.824324  0.671429  0.662162
+ 8              XGBoost  0.704403   0.658824  0.756757  0.664286  0.658681)
+
+ # evolution [3]
+                  Model  F1_Score  Precision    Recall  Accuracy   ROC_AUC
+ 0  Logistic Regression  0.795181   0.717391  0.891892  0.757143  0.748976
+ 1          Naive Bayes  0.805031   0.752941  0.864865  0.778571  0.773342
+ 2  K-Nearest Neighbors  0.735484   0.703704  0.770270  0.707143  0.703317
+ 3                  SVM  0.797619   0.712766  0.905405  0.757143  0.748157
+ 4                  MLP  0.787879   0.714286  0.878378  0.750000  0.742219
+ 5        Decision Tree  0.631579   0.615385  0.648649  0.600000  0.597052
+ 6          Extra Trees  0.688312   0.662500  0.716216  0.657143  0.653563
+ 7        Random Forest  0.704403   0.658824  0.756757  0.664286  0.658681
+ 8              XGBoost  0.713376   0.674699  0.756757  0.678571  0.673833)
+
+# str features [4]
+                  Model  F1_Score  Precision    Recall  Accuracy   ROC_AUC
+ 0  Logistic Regression  0.729412   0.645833  0.837838  0.671429  0.661343
+ 1          Naive Bayes  0.723926   0.662921  0.797297  0.678571  0.671376
+ 2  K-Nearest Neighbors  0.662338   0.637500  0.689189  0.628571  0.624898
+ 3                  SVM  0.727273   0.627451  0.864865  0.657143  0.644554
+ 4                  MLP  0.710843   0.641304  0.797297  0.657143  0.648649
+ 5        Decision Tree  0.561151   0.600000  0.527027  0.564286  0.566544
+ 6          Extra Trees  0.567376   0.597015  0.540541  0.564286  0.565725
+ 7        Random Forest  0.596026   0.584416  0.608108  0.564286  0.561630
+ 8              XGBoost  0.630872   0.626667  0.635135  0.607143  0.605446)
+
+ #=========================================================================
+ # stability + evolution + str features [13 = 6+3+4]
+                  Model  F1_Score  Precision    Recall  Accuracy   ROC_AUC
+ 0  Logistic Regression  0.726115   0.686747  0.770270  0.692857  0.688165
+ 1          Naive Bayes  0.730769   0.695122  0.770270  0.700000  0.695741
+ 2  K-Nearest Neighbors  0.742515   0.666667  0.837838  0.692857  0.684070
+ 3                  SVM  0.763636   0.692308  0.851351  0.721429  0.713554
+ 4                  MLP  0.717949   0.682927  0.756757  0.685714  0.681409
+ 5        Decision Tree  0.671429   0.712121  0.635135  0.671429  0.673628
+ 6          Extra Trees  0.756410   0.719512  0.797297  0.728571  0.724406
+ 7        Random Forest  0.742138   0.694118  0.797297  0.707143  0.701679
+ 8              XGBoost  0.692810   0.670886  0.716216  0.664286  0.661138)
+
+ # stability + evolution [9=6+3]
+                  Model  F1_Score  Precision    Recall  Accuracy   ROC_AUC
+ 0  Logistic Regression  0.729560   0.682353  0.783784  0.692857  0.687346
+ 1          Naive Bayes  0.743590   0.707317  0.783784  0.714286  0.710074
+ 2  K-Nearest Neighbors  0.720497   0.666667  0.783784  0.678571  0.672195
+ 3                  SVM  0.771084   0.695652  0.864865  0.728571  0.720311
+ 4                  MLP  0.679739   0.658228  0.702703  0.650000  0.646806
+ 5        Decision Tree  0.620690   0.633803  0.608108  0.607143  0.607084
+ 6          Extra Trees  0.727273   0.700000  0.756757  0.700000  0.696560
+ 7        Random Forest  0.734177   0.690476  0.783784  0.700000  0.694922
+ 8              XGBoost  0.675497   0.662338  0.689189  0.650000  0.647625)
+
+ # stability + str features [10=6+4]
+                  Model  F1_Score  Precision    Recall  Accuracy   ROC_AUC
+ 0  Logistic Regression  0.750000   0.697674  0.810811  0.714286  0.708436
+ 1          Naive Bayes  0.714286   0.687500  0.743243  0.685714  0.682228
+ 2  K-Nearest Neighbors  0.687500   0.639535  0.743243  0.642857  0.636773
+ 3                  SVM  0.743902   0.677778  0.824324  0.700000  0.692465
+ 4                  MLP  0.716981   0.670588  0.770270  0.678571  0.673014
+ 5        Decision Tree  0.616438   0.625000  0.608108  0.600000  0.599509
+ 6          Extra Trees  0.697368   0.679487  0.716216  0.671429  0.668714
+ 7        Random Forest  0.684211   0.666667  0.702703  0.657143  0.654382
+ 8              XGBoost  0.666667   0.645570  0.689189  0.635714  0.632473)
+
+ # evolution + str features[7=3+4]
+                  Model  F1_Score  Precision    Recall  Accuracy   ROC_AUC
+ 0  Logistic Regression  0.773006   0.707865  0.851351  0.735714  0.728706
+ 1          Naive Bayes  0.750000   0.730769  0.770270  0.728571  0.726044
+ 2  K-Nearest Neighbors  0.737500   0.686047  0.797297  0.700000  0.694103
+ 3                  SVM  0.763636   0.692308  0.851351  0.721429  0.713554
+ 4                  MLP  0.775758   0.703297  0.864865  0.735714  0.727887
+ 5        Decision Tree  0.675497   0.662338  0.689189  0.650000  0.647625
+ 6          Extra Trees  0.715232   0.701299  0.729730  0.692857  0.690622
+ 7        Random Forest  0.715232   0.701299  0.729730  0.692857  0.690622
+ 8              XGBoost  0.721519   0.678571  0.770270  0.685714  0.680590)