Added loopity_loop() to run multiple models with stratified k-fold; got stuck in infinite loops and nested dicts
This commit is contained in:
parent
69d0c1b557
commit
7aead2d4f4
18 changed files with 287 additions and 62 deletions
my_data5.py
@@ -1,172 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 3 17:08:18 2022

@author: tanu
"""
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import os
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
import pandas as pd
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")

# gene 'pncA'
# drug = 'pyrazinamide'
# gene and drug are used below to build the input/output paths, so they need to be
# defined; the values are taken from the two comments above
gene = 'pncA'
drug = 'pyrazinamide'
#==============
# directories
#==============
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'

#=======
# input
#=======
# this needs to be merged_df2 or merged_df3?
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'

my_df = pd.read_csv(infile_ml1)
my_df.dtypes
my_df_cols = my_df.columns

gene_basicL = ['pnca']
geneL_naL = ['gid', 'rpob']
geneL_ppi2L = ['alr', 'embb', 'katg', 'rpob']
#%%============================================================================
# GET Y
# Y = my_df.loc[:,drug] #has NA
dm_om_map = {'DM': 1, 'OM': 0}
my_df['resistance'] = my_df['mutation_info_labels'].map(dm_om_map)

# sanity check
my_df['resistance'].value_counts()
my_df['mutation_info_labels'].value_counts()
Y = my_df['resistance']
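# A minimal sanity-check sketch (not in the original script): .map() returns NaN for any
# label outside dm_om_map, so it may be worth dropping unmapped rows before modelling.
if my_df['resistance'].isna().any():
    my_df = my_df.dropna(subset = ['resistance'])
    Y = my_df['resistance']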

# GET X
cols = my_df.columns
X_stability = my_df[['ligand_distance'
                     , 'ligand_affinity_change'
                     , 'duet_stability_change'
                     , 'ddg_foldx'
                     , 'deepddg'
                     , 'ddg_dynamut2']]

X_evol = my_df[['consurf_score'
                , 'snap2_score'
                , 'snap2_accuracy_pc']]

X_str = my_df[['asa'
               , 'rsa'
               , 'kd_values'
               , 'rd_values']]

#%% try combinations
# alternative feature sets: each assignment below overwrites the previous one, so only
# the last uncommented line takes effect
X_vars = X_stability
X_vars = X_evol
X_vars = X_str

X_vars = pd.concat([X_stability, X_evol, X_str], axis = 1)
X_vars = pd.concat([X_stability, X_evol], axis = 1)
X_vars = pd.concat([X_stability, X_str], axis = 1)
X_vars = pd.concat([X_evol, X_str], axis = 1)

#%%
X_vars.shape[1]
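# A hedged alternative to the manual re-assignments above (not in the original script):
# keeping the candidate feature sets in a dict makes it easy to loop over every combination
# instead of re-running the script with a different X_vars line each time. The key names
# below are illustrative.
feature_sets = {'stability'      : X_stability,
                'evolutionary'   : X_evol,
                'structural'     : X_str,
                'stability_evol' : pd.concat([X_stability, X_evol], axis = 1),
                'all_features'   : pd.concat([X_stability, X_evol, X_str], axis = 1)}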

# TODO: stratified cross validate (see the sketch after the split below)
# Train-test Split
rs = {'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X_vars,
                                                    Y,
                                                    test_size = 0.33,
                                                    random_state = 42)
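# A hedged sketch of the "stratified cross validate" TODO above (an assumed approach, not
# the original loopity_loop): cross_validate with a StratifiedKFold splitter scores each
# pipeline on every fold, which sidesteps hand-rolled nested loops and nested result dicts.
# The two models below are just illustrative; any of the classifiers used later would fit.
from sklearn.model_selection import StratifiedKFold, cross_validate

skf = StratifiedKFold(n_splits = 5, shuffle = True, **rs)
cv_models = {'Logistic Regression': LogisticRegression(**rs),
             'Random Forest'      : RandomForestClassifier(**rs)}
cv_rows = {}
for name, clf in cv_models.items():
    cv_pipe = Pipeline(steps = [('scaler', MinMaxScaler()),
                                ('classifier', clf)])
    out = cross_validate(cv_pipe, X_vars, Y,
                         cv      = skf,
                         scoring = ['f1', 'precision', 'recall', 'accuracy', 'roc_auc'])
    cv_rows[name] = {k: v.mean() for k, v in out.items() if k.startswith('test_')}
cv_scores_df = pd.DataFrame(cv_rows).T   # one row per model, one column per mean metric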

# Classification - Model Pipeline
def modelPipeline(X_train, X_test, y_train, y_test):

    log_reg = LogisticRegression(**rs)
    nb      = BernoulliNB()
    knn     = KNeighborsClassifier()
    svm     = SVC(**rs)
    mlp     = MLPClassifier(max_iter = 500, **rs)
    dt      = DecisionTreeClassifier(**rs)
    et      = ExtraTreesClassifier(**rs)
    rf      = RandomForestClassifier(**rs)
    xgb     = XGBClassifier(**rs, verbosity = 0)

    clfs = [('Logistic Regression', log_reg),
            ('Naive Bayes', nb),
            ('K-Nearest Neighbors', knn),
            ('SVM', svm),
            ('MLP', mlp),
            ('Decision Tree', dt),
            ('Extra Trees', et),
            ('Random Forest', rf),
            ('XGBoost', xgb)]

    pipelines  = []
    score_rows = []   # one dict of metrics per model

    for clf_name, clf in clfs:
        pipeline = Pipeline(steps = [('scaler', MinMaxScaler()),
                                     ('classifier', clf)])
        pipeline.fit(X_train, y_train)

        # Model predictions
        y_pred = pipeline.predict(X_test)

        # Metrics on the held-out test set
        fscore  = f1_score(y_test, y_pred)
        pres    = precision_score(y_test, y_pred)
        rcall   = recall_score(y_test, y_pred)
        accu    = accuracy_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred)

        pipelines.append(pipeline)
        score_rows.append({'Model'     : clf_name,
                           'F1_Score'  : fscore,
                           'Precision' : pres,
                           'Recall'    : rcall,
                           'Accuracy'  : accu,
                           'ROC_AUC'   : roc_auc})

    # DataFrame.append() was removed in pandas 2.0; building the frame from a list of
    # row dicts gives the same result in one step
    scores_df = pd.DataFrame(score_rows,
                             columns = ['Model', 'F1_Score', 'Precision',
                                        'Recall', 'Accuracy', 'ROC_AUC'])

    return pipelines, scores_df


pipelines, scores_df = modelPipeline(X_train, X_test, y_train, y_test)
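# A small follow-on sketch (not in the original script): with the returned scores_df it is
# straightforward to rank the models, e.g. by F1 score on the test split.
print(scores_df.sort_values('F1_Score', ascending = False))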