added my_data5.py to run multiple classification algorithms and added preliminary results
This commit is contained in:
parent
1fecbc15c9
commit
bff16fc219
2 changed files with 240 additions and 0 deletions
156 my_data5.py Normal file
@@ -0,0 +1,156 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 3 17:08:18 2022

@author: tanu
"""
import os

import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

from xgboost import XGBClassifier
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
# TODO: check whether this should read merged_df2 or merged_df3
# gene: 'pncA'
drug = 'pyrazinamide'

my_df = pd.read_csv("pnca_merged_df3.csv")

my_df.dtypes
my_df_cols = my_df.columns
#%%============================================================================
# GET Y
# Y = my_df.loc[:, drug]  # has NA
# binary target: drug-associated mutation (DM) = 1, other mutation (OM) = 0
dm_om_map = {'DM': 1, 'OM': 0}
my_df['resistance'] = my_df['mutation_info_labels'].map(dm_om_map)

# sanity check
my_df['resistance'].value_counts()
my_df['mutation_info_labels'].value_counts()
Y = my_df['resistance']

# GET X
cols = my_df.columns
X_stability = my_df[['ligand_distance',
                     'ligand_affinity_change',
                     'duet_stability_change',
                     'ddg_foldx',
                     'deepddg',
                     'ddg_dynamut2']]

X_evol = my_df[['consurf_score',
                'snap2_score',
                'snap2_accuracy_pc']]

X_str = my_df[['asa',
               'rsa',
               'kd_values',
               'rd_values']]

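# feature-set sizes: stability = 6, evolution = 3, str = 4
# (these match the [6], [3] and [4] headers in my_data5_results_pnca)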
#%% try combinations: run ONE assignment at a time (each overwrites X_vars)
# X_vars = X_stability
# X_vars = X_evol
# X_vars = X_str

X_vars = pd.concat([X_stability, X_evol, X_str], axis = 1)
# X_vars = pd.concat([X_stability, X_evol], axis = 1)
# X_vars = pd.concat([X_stability, X_str], axis = 1)
# X_vars = pd.concat([X_evol, X_str], axis = 1)

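# optional sanity check (sketch): most sklearn estimators reject NaN, so it is
# worth confirming the selected columns are complete before fitting, e.g.
# X_vars.isna().sum()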
#%%
X_vars.shape[1]

# TODO: stratified cross validate
# Train-test Split
rs = {'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X_vars,
                                                    Y,
                                                    test_size = 0.33,
                                                    **rs)
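# NOTE (sketch for the TODO above, not what produced the results below): a
# stratified split would preserve the DM/OM class ratio in both partitions, e.g.
# X_train, X_test, y_train, y_test = train_test_split(
#     X_vars, Y, test_size = 0.33, stratify = Y, **rs)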
# Classification - Model Pipeline
def modelPipeline(X_train, X_test, y_train, y_test):

    log_reg = LogisticRegression(**rs)
    nb = BernoulliNB()
    knn = KNeighborsClassifier()
    svm = SVC(**rs)
    mlp = MLPClassifier(max_iter = 500, **rs)
    dt = DecisionTreeClassifier(**rs)
    et = ExtraTreesClassifier(**rs)
    rf = RandomForestClassifier(**rs)
    xgb = XGBClassifier(**rs, verbosity = 0)

    clfs = [
        ('Logistic Regression', log_reg),
        ('Naive Bayes', nb),
        ('K-Nearest Neighbors', knn),
        ('SVM', svm),
        ('MLP', mlp),
        ('Decision Tree', dt),
        ('Extra Trees', et),
        ('Random Forest', rf),
        ('XGBoost', xgb)
    ]

    pipelines = []
    scores = []

    for clf_name, clf in clfs:

        # scale features, then fit the classifier
        pipeline = Pipeline(steps = [
            ('scaler', StandardScaler()),
            ('classifier', clf)
        ])
        pipeline.fit(X_train, y_train)

        # Model predictions
        y_pred = pipeline.predict(X_test)

        # test-set metrics
        fscore = f1_score(y_test, y_pred)       # F1-Score
        pres = precision_score(y_test, y_pred)  # Precision
        rcall = recall_score(y_test, y_pred)    # Recall
        accu = accuracy_score(y_test, y_pred)   # Accuracy
        # ROC_AUC from hard class labels, not predicted probabilities
        roc_auc = roc_auc_score(y_test, y_pred)

        pipelines.append(pipeline)
        scores.append({'Model': clf_name,
                       'F1_Score': fscore,
                       'Precision': pres,
                       'Recall': rcall,
                       'Accuracy': accu,
                       'ROC_AUC': roc_auc})

    # DataFrame.append() is deprecated in pandas; build the frame once instead
    scores_df = pd.DataFrame(scores, columns = ['Model', 'F1_Score', 'Precision',
                                                'Recall', 'Accuracy', 'ROC_AUC'])

    return pipelines, scores_df


pipelines, scores_df = modelPipeline(X_train, X_test, y_train, y_test)
print(scores_df)
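# cross-validation sketch (assumed follow-up for the stratified-CV TODO; not
# run for the prelim results): StratifiedKFold keeps the DM/OM ratio per fold
# from sklearn.model_selection import StratifiedKFold, cross_val_score
# skf = StratifiedKFold(n_splits = 10, shuffle = True, **rs)
# pipe = Pipeline(steps = [('scaler', StandardScaler()),
#                          ('classifier', LogisticRegression(**rs))])
# cross_val_score(pipe, X_vars, Y, cv = skf, scoring = 'f1')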
84 my_data5_results_pnca Normal file
@@ -0,0 +1,84 @@
# stability [6]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.738854 0.698795 0.783784 0.707143 0.702498
1 Naive Bayes 0.627451 0.607595 0.648649 0.592857 0.589476
2 K-Nearest Neighbors 0.731707 0.666667 0.810811 0.685714 0.678133
3 SVM 0.729412 0.645833 0.837838 0.671429 0.661343
4 MLP 0.670968 0.641975 0.702703 0.635714 0.631654
5 Decision Tree 0.653595 0.632911 0.675676 0.621429 0.618141
6 Extra Trees 0.733728 0.652632 0.837838 0.678571 0.668919
7 Random Forest 0.726190 0.648936 0.824324 0.671429 0.662162
8 XGBoost 0.704403 0.658824 0.756757 0.664286 0.658681

# evolution [3]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.795181 0.717391 0.891892 0.757143 0.748976
1 Naive Bayes 0.805031 0.752941 0.864865 0.778571 0.773342
2 K-Nearest Neighbors 0.735484 0.703704 0.770270 0.707143 0.703317
3 SVM 0.797619 0.712766 0.905405 0.757143 0.748157
4 MLP 0.787879 0.714286 0.878378 0.750000 0.742219
5 Decision Tree 0.631579 0.615385 0.648649 0.600000 0.597052
6 Extra Trees 0.688312 0.662500 0.716216 0.657143 0.653563
7 Random Forest 0.704403 0.658824 0.756757 0.664286 0.658681
8 XGBoost 0.713376 0.674699 0.756757 0.678571 0.673833

# str features [4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.729412 0.645833 0.837838 0.671429 0.661343
1 Naive Bayes 0.723926 0.662921 0.797297 0.678571 0.671376
2 K-Nearest Neighbors 0.662338 0.637500 0.689189 0.628571 0.624898
3 SVM 0.727273 0.627451 0.864865 0.657143 0.644554
4 MLP 0.710843 0.641304 0.797297 0.657143 0.648649
5 Decision Tree 0.561151 0.600000 0.527027 0.564286 0.566544
6 Extra Trees 0.567376 0.597015 0.540541 0.564286 0.565725
7 Random Forest 0.596026 0.584416 0.608108 0.564286 0.561630
8 XGBoost 0.630872 0.626667 0.635135 0.607143 0.605446

#=========================================================================
# stability + evolution + str features [13 = 6+3+4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.726115 0.686747 0.770270 0.692857 0.688165
1 Naive Bayes 0.730769 0.695122 0.770270 0.700000 0.695741
2 K-Nearest Neighbors 0.742515 0.666667 0.837838 0.692857 0.684070
3 SVM 0.763636 0.692308 0.851351 0.721429 0.713554
4 MLP 0.717949 0.682927 0.756757 0.685714 0.681409
5 Decision Tree 0.671429 0.712121 0.635135 0.671429 0.673628
6 Extra Trees 0.756410 0.719512 0.797297 0.728571 0.724406
7 Random Forest 0.742138 0.694118 0.797297 0.707143 0.701679
8 XGBoost 0.692810 0.670886 0.716216 0.664286 0.661138

# stability + evolution [9 = 6+3]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.729560 0.682353 0.783784 0.692857 0.687346
1 Naive Bayes 0.743590 0.707317 0.783784 0.714286 0.710074
2 K-Nearest Neighbors 0.720497 0.666667 0.783784 0.678571 0.672195
3 SVM 0.771084 0.695652 0.864865 0.728571 0.720311
4 MLP 0.679739 0.658228 0.702703 0.650000 0.646806
5 Decision Tree 0.620690 0.633803 0.608108 0.607143 0.607084
6 Extra Trees 0.727273 0.700000 0.756757 0.700000 0.696560
7 Random Forest 0.734177 0.690476 0.783784 0.700000 0.694922
8 XGBoost 0.675497 0.662338 0.689189 0.650000 0.647625

# stability + str features [10 = 6+4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.750000 0.697674 0.810811 0.714286 0.708436
1 Naive Bayes 0.714286 0.687500 0.743243 0.685714 0.682228
2 K-Nearest Neighbors 0.687500 0.639535 0.743243 0.642857 0.636773
3 SVM 0.743902 0.677778 0.824324 0.700000 0.692465
4 MLP 0.716981 0.670588 0.770270 0.678571 0.673014
5 Decision Tree 0.616438 0.625000 0.608108 0.600000 0.599509
6 Extra Trees 0.697368 0.679487 0.716216 0.671429 0.668714
7 Random Forest 0.684211 0.666667 0.702703 0.657143 0.654382
8 XGBoost 0.666667 0.645570 0.689189 0.635714 0.632473

# evolution + str features [7 = 3+4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.773006 0.707865 0.851351 0.735714 0.728706
1 Naive Bayes 0.750000 0.730769 0.770270 0.728571 0.726044
2 K-Nearest Neighbors 0.737500 0.686047 0.797297 0.700000 0.694103
3 SVM 0.763636 0.692308 0.851351 0.721429 0.713554
4 MLP 0.775758 0.703297 0.864865 0.735714 0.727887
5 Decision Tree 0.675497 0.662338 0.689189 0.650000 0.647625
6 Extra Trees 0.715232 0.701299 0.729730 0.692857 0.690622
7 Random Forest 0.715232 0.701299 0.729730 0.692857 0.690622
8 XGBoost 0.721519 0.678571 0.770270 0.685714 0.680590