#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar  3 17:08:18 2022

@author: tanu

Fit a panel of scikit-learn / XGBoost classifiers on the pncA feature table
and report hold-out scores for each one.

The script is organised in ``#%%`` cells so it can be run cell by cell in an
IDE; run top to bottom it loads the data, builds X and Y, performs a single
train/test split and calls ``modelPipeline`` once.
"""
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
import pandas as pd

#%%
homedir = os.path.expanduser("~")
# The CSV below is read with a relative path, so the working directory matters.
os.chdir(os.path.join(homedir, "git/ML_AI_training/test_data"))

# this needs to be merged_df2 or merged_df3?

#gene 'pncA'
drug = 'pyrazinamide'

my_df = pd.read_csv("pnca_merged_df3.csv")
my_df.dtypes
my_df_cols = my_df.columns

#%%============================================================================
# GET Y
# Y = my_df.loc[:,drug] #has NA
dm_om_map = {'DM': 1, 'OM': 0}
# Labels other than 'DM'/'OM' are not in the map and therefore become NaN.
my_df['resistance'] = my_df['mutation_info_labels'].map(dm_om_map)

# sanity check
my_df['resistance'].value_counts()
my_df['mutation_info_labels'].value_counts()

Y = my_df['resistance']

# GET X
cols = my_df.columns
X_stability = my_df[['ligand_distance'
                     , 'ligand_affinity_change'
                     , 'duet_stability_change'
                     , 'ddg_foldx'
                     , 'deepddg'
                     , 'ddg_dynamut2']]

X_evol = my_df[['consurf_score'
                , 'snap2_score'
                , 'snap2_accuracy_pc']]

X_str = my_df[['asa'
               , 'rsa'
               , 'kd_values'
               , 'rd_values']]

#%% try combinations
# Each line is an alternative feature set meant to be picked interactively.
# When the file is executed top to bottom, only the LAST assignment is used.
X_vars = X_stability
X_vars = X_evol
X_vars = X_str

X_vars = pd.concat([X_stability, X_evol, X_str], axis = 1)
X_vars = pd.concat([X_stability, X_evol], axis = 1)
X_vars = pd.concat([X_stability, X_str], axis = 1)
X_vars = pd.concat([X_evol, X_str], axis = 1)

#%%
X_vars.shape[1]

# TODO: stratified cross validate
# Train-test Split
rs = {'random_state': 42}
# NOTE(review): the split is not stratified on Y (see TODO above).
X_train, X_test, y_train, y_test = train_test_split(X_vars,
                                                    Y,
                                                    test_size = 0.33,
                                                    random_state = 42)


# Classification - Model Pipeline
def modelPipeline(X_train, X_test, y_train, y_test):
    """Fit nine classifiers and score each one on the held-out data.

    Every classifier is wrapped in a ``Pipeline`` whose first step is a
    ``StandardScaler``; the pipeline is fitted on the training data and then
    used to predict the test data.

    Parameters
    ----------
    X_train, X_test
        Feature tables used for fitting and for prediction respectively.
    y_train, y_test
        Target labels matching ``X_train`` and ``X_test``.

    Returns
    -------
    pipelines : list
        The fitted ``Pipeline`` objects, in the order the models were run.
    scores_df : pandas.DataFrame
        One row per model with the columns ``Model``, ``F1_Score``,
        ``Precision``, ``Recall``, ``Accuracy`` and ``ROC_AUC``.

    Notes
    -----
    Reads the module-level ``rs`` dict for the ``random_state`` passed to the
    estimators that accept one.
    """
    log_reg = LogisticRegression(**rs)
    nb = BernoulliNB()
    knn = KNeighborsClassifier()
    svm = SVC(**rs)
    mlp = MLPClassifier(max_iter=500, **rs)
    dt = DecisionTreeClassifier(**rs)
    et = ExtraTreesClassifier(**rs)
    rf = RandomForestClassifier(**rs)
    xgb = XGBClassifier(**rs, verbosity=0)

    clfs = [
        ('Logistic Regression', log_reg),
        ('Naive Bayes', nb),
        ('K-Nearest Neighbors', knn),
        ('SVM', svm),
        ('MLP', mlp),
        ('Decision Tree', dt),
        ('Extra Trees', et),
        ('Random Forest', rf),
        ('XGBoost', xgb),
    ]

    score_columns = ['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC']

    pipelines = []
    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end: DataFrame.append was removed in pandas 2.0 and copied the
    # whole frame on every call.
    score_rows = []

    for clf_name, clf in clfs:
        pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('classifier', clf),
        ])
        pipeline.fit(X_train, y_train)

        # Model predictions
        y_pred = pipeline.predict(X_test)

        # F1-Score
        fscore = f1_score(y_test, y_pred)
        # Precision
        pres = precision_score(y_test, y_pred)
        # Recall
        rcall = recall_score(y_test, y_pred)
        # Accuracy
        accu = accuracy_score(y_test, y_pred)
        # ROC_AUC
        # NOTE(review): computed from hard class predictions, not from
        # probabilities / decision scores -- confirm this is intended.
        roc_auc = roc_auc_score(y_test, y_pred)

        pipelines.append(pipeline)
        score_rows.append({
            'Model': clf_name,
            'F1_Score': fscore,
            'Precision': pres,
            'Recall': rcall,
            'Accuracy': accu,
            'ROC_AUC': roc_auc,
        })

    scores_df = pd.DataFrame(score_rows, columns=score_columns)

    return pipelines, scores_df


modelPipeline(X_train, X_test, y_train, y_test)