added my_data5.py to run multiple classification algorithms and added preliminary results

Tanushree Tunstall 2022-03-03 17:59:51 +00:00
parent 1fecbc15c9
commit bff16fc219
2 changed files with 240 additions and 0 deletions

my_data5.py (new file, 156 lines)

@@ -0,0 +1,156 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 3 17:08:18 2022
@author: tanu
"""
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)
from xgboost import XGBClassifier
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
# TODO: check whether the input should be merged_df2 or merged_df3
# gene: 'pncA'
drug = 'pyrazinamide'
my_df = pd.read_csv("pnca_merged_df3.csv")
my_df.dtypes
my_df_cols = my_df.columns
#%%============================================================================
# GET Y
# Y = my_df.loc[:, drug]  # has NAs, so derive the target from mutation_info_labels instead
dm_om_map = {'DM': 1, 'OM': 0}
my_df['resistance'] = my_df['mutation_info_labels'].map(dm_om_map)
# sanity check
my_df['resistance'].value_counts()
my_df['mutation_info_labels'].value_counts()
Y = my_df['resistance']
# GET X
cols = my_df.columns
X_stability = my_df[['ligand_distance'
                     , 'ligand_affinity_change'
                     , 'duet_stability_change'
                     , 'ddg_foldx'
                     , 'deepddg'
                     , 'ddg_dynamut2']]
X_evol = my_df[['consurf_score'
                , 'snap2_score'
                , 'snap2_accuracy_pc']]
X_str = my_df[['asa'
               , 'rsa'
               , 'kd_values'
               , 'rd_values']]
#%% try combinations: run exactly ONE of the assignments below per experiment
# (each re-assigns X_vars; a dict-based alternative is sketched after this block)
X_vars = X_stability
X_vars = X_evol
X_vars = X_str
X_vars = pd.concat([X_stability, X_evol, X_str], axis = 1)
X_vars = pd.concat([X_stability, X_evol], axis = 1)
X_vars = pd.concat([X_stability, X_str], axis = 1)
X_vars = pd.concat([X_evol, X_str], axis = 1)
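# Alternative sketch (not used for the committed results): keep the candidate
# feature sets in a dict and loop over them instead of re-running the cell
# with a different assignment; the names below are illustrative.
# feature_sets = {'stability' : X_stability,
#                 'evolution' : X_evol,
#                 'structure' : X_str,
#                 'all'       : pd.concat([X_stability, X_evol, X_str], axis = 1)}
# for name, X_vars in feature_sets.items():
#     ...  # split, fit and score as below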
#%%
X_vars.shape[1]
# TODO: stratified cross-validation (a sketch follows the split below)
# Train-test split
rs = {'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X_vars,
                                                    Y,
                                                    test_size = 0.33,
                                                    random_state = 42)
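# Sketch for the TODO above: stratified k-fold CV instead of a single split.
# Illustrative only and not wired in; the fold count and F1 scoring are
# assumptions, not settings used for the committed results.
# from sklearn.model_selection import StratifiedKFold, cross_val_score
# skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)
# cv_pipe = Pipeline(steps = [('scaler', StandardScaler()),
#                             ('classifier', LogisticRegression(**rs))])
# cv_f1 = cross_val_score(cv_pipe, X_vars, Y, cv = skf, scoring = 'f1')
# print(cv_f1.mean(), cv_f1.std())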
# Classification - Model Pipeline
def modelPipeline(X_train, X_test, y_train, y_test):
    log_reg = LogisticRegression(**rs)
    nb = BernoulliNB()
    knn = KNeighborsClassifier()
    svm = SVC(**rs)
    mlp = MLPClassifier(max_iter = 500, **rs)
    dt = DecisionTreeClassifier(**rs)
    et = ExtraTreesClassifier(**rs)
    rf = RandomForestClassifier(**rs)
    xgb = XGBClassifier(**rs, verbosity = 0)

    clfs = [('Logistic Regression', log_reg),
            ('Naive Bayes', nb),
            ('K-Nearest Neighbors', knn),
            ('SVM', svm),
            ('MLP', mlp),
            ('Decision Tree', dt),
            ('Extra Trees', et),
            ('Random Forest', rf),
            ('XGBoost', xgb)]

    pipelines = []
    scores = []  # one dict of metrics per classifier
    for clf_name, clf in clfs:
        # scale inside the pipeline so the scaler is fit on training data only
        pipeline = Pipeline(steps = [('scaler', StandardScaler()),
                                     ('classifier', clf)])
        pipeline.fit(X_train, y_train)
        # model predictions on the held-out test set
        y_pred = pipeline.predict(X_test)
        pipelines.append(pipeline)
        # NB: ROC_AUC is computed from hard predictions here; predicted
        # probabilities would give the conventional AUC
        scores.append({'Model'     : clf_name,
                       'F1_Score'  : f1_score(y_test, y_pred),
                       'Precision' : precision_score(y_test, y_pred),
                       'Recall'    : recall_score(y_test, y_pred),
                       'Accuracy'  : accuracy_score(y_test, y_pred),
                       'ROC_AUC'   : roc_auc_score(y_test, y_pred)})
    # build the results table once at the end (DataFrame.append is
    # deprecated/removed in recent pandas)
    scores_df = pd.DataFrame(scores,
                             columns = ['Model', 'F1_Score', 'Precision',
                                        'Recall', 'Accuracy', 'ROC_AUC'])
    return pipelines, scores_df

pipelines, scores_df = modelPipeline(X_train, X_test, y_train, y_test)
scores_df
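# Possible follow-up (illustrative, not part of this commit's output):
# scores_df.sort_values('F1_Score', ascending = False)
# scores_df.to_csv(homedir + '/git/ML_AI_training/test_data/my_data5_results.csv',
#                  index = False)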

my_data5_results_pnca (new file, 84 lines)

@@ -0,0 +1,84 @@
# stability [6]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.738854 0.698795 0.783784 0.707143 0.702498
1 Naive Bayes 0.627451 0.607595 0.648649 0.592857 0.589476
2 K-Nearest Neighbors 0.731707 0.666667 0.810811 0.685714 0.678133
3 SVM 0.729412 0.645833 0.837838 0.671429 0.661343
4 MLP 0.670968 0.641975 0.702703 0.635714 0.631654
5 Decision Tree 0.653595 0.632911 0.675676 0.621429 0.618141
6 Extra Trees 0.733728 0.652632 0.837838 0.678571 0.668919
7 Random Forest 0.726190 0.648936 0.824324 0.671429 0.662162
8 XGBoost 0.704403 0.658824 0.756757 0.664286 0.658681
# evolution [3]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.795181 0.717391 0.891892 0.757143 0.748976
1 Naive Bayes 0.805031 0.752941 0.864865 0.778571 0.773342
2 K-Nearest Neighbors 0.735484 0.703704 0.770270 0.707143 0.703317
3 SVM 0.797619 0.712766 0.905405 0.757143 0.748157
4 MLP 0.787879 0.714286 0.878378 0.750000 0.742219
5 Decision Tree 0.631579 0.615385 0.648649 0.600000 0.597052
6 Extra Trees 0.688312 0.662500 0.716216 0.657143 0.653563
7 Random Forest 0.704403 0.658824 0.756757 0.664286 0.658681
8 XGBoost 0.713376 0.674699 0.756757 0.678571 0.673833
# str features [4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.729412 0.645833 0.837838 0.671429 0.661343
1 Naive Bayes 0.723926 0.662921 0.797297 0.678571 0.671376
2 K-Nearest Neighbors 0.662338 0.637500 0.689189 0.628571 0.624898
3 SVM 0.727273 0.627451 0.864865 0.657143 0.644554
4 MLP 0.710843 0.641304 0.797297 0.657143 0.648649
5 Decision Tree 0.561151 0.600000 0.527027 0.564286 0.566544
6 Extra Trees 0.567376 0.597015 0.540541 0.564286 0.565725
7 Random Forest 0.596026 0.584416 0.608108 0.564286 0.561630
8 XGBoost 0.630872 0.626667 0.635135 0.607143 0.605446
#=========================================================================
# stability + evolution + str features [13 = 6+3+4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.726115 0.686747 0.770270 0.692857 0.688165
1 Naive Bayes 0.730769 0.695122 0.770270 0.700000 0.695741
2 K-Nearest Neighbors 0.742515 0.666667 0.837838 0.692857 0.684070
3 SVM 0.763636 0.692308 0.851351 0.721429 0.713554
4 MLP 0.717949 0.682927 0.756757 0.685714 0.681409
5 Decision Tree 0.671429 0.712121 0.635135 0.671429 0.673628
6 Extra Trees 0.756410 0.719512 0.797297 0.728571 0.724406
7 Random Forest 0.742138 0.694118 0.797297 0.707143 0.701679
8 XGBoost 0.692810 0.670886 0.716216 0.664286 0.661138
# stability + evolution [9=6+3]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.729560 0.682353 0.783784 0.692857 0.687346
1 Naive Bayes 0.743590 0.707317 0.783784 0.714286 0.710074
2 K-Nearest Neighbors 0.720497 0.666667 0.783784 0.678571 0.672195
3 SVM 0.771084 0.695652 0.864865 0.728571 0.720311
4 MLP 0.679739 0.658228 0.702703 0.650000 0.646806
5 Decision Tree 0.620690 0.633803 0.608108 0.607143 0.607084
6 Extra Trees 0.727273 0.700000 0.756757 0.700000 0.696560
7 Random Forest 0.734177 0.690476 0.783784 0.700000 0.694922
8 XGBoost 0.675497 0.662338 0.689189 0.650000 0.647625
# stability + str features [10=6+4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.750000 0.697674 0.810811 0.714286 0.708436
1 Naive Bayes 0.714286 0.687500 0.743243 0.685714 0.682228
2 K-Nearest Neighbors 0.687500 0.639535 0.743243 0.642857 0.636773
3 SVM 0.743902 0.677778 0.824324 0.700000 0.692465
4 MLP 0.716981 0.670588 0.770270 0.678571 0.673014
5 Decision Tree 0.616438 0.625000 0.608108 0.600000 0.599509
6 Extra Trees 0.697368 0.679487 0.716216 0.671429 0.668714
7 Random Forest 0.684211 0.666667 0.702703 0.657143 0.654382
8 XGBoost 0.666667 0.645570 0.689189 0.635714 0.632473
# evolution + str features [7 = 3+4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.773006 0.707865 0.851351 0.735714 0.728706
1 Naive Bayes 0.750000 0.730769 0.770270 0.728571 0.726044
2 K-Nearest Neighbors 0.737500 0.686047 0.797297 0.700000 0.694103
3 SVM 0.763636 0.692308 0.851351 0.721429 0.713554
4 MLP 0.775758 0.703297 0.864865 0.735714 0.727887
5 Decision Tree 0.675497 0.662338 0.689189 0.650000 0.647625
6 Extra Trees 0.715232 0.701299 0.729730 0.692857 0.690622
7 Random Forest 0.715232 0.701299 0.729730 0.692857 0.690622
8 XGBoost 0.721519 0.678571 0.770270 0.685714 0.680590