From bff16fc219d889d2ecb465a88d638f8da33a2c80 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 3 Mar 2022 17:59:51 +0000
Subject: [PATCH] added my_data5.py to run multiple classification algorithms
 and added prelim results

---
 my_data5.py           | 156 ++++++++++++++++++++++++++++++++++++++++++
 my_data5_results_pnca |  84 +++++++++++++++++++++++
 2 files changed, 240 insertions(+)
 create mode 100644 my_data5.py
 create mode 100644 my_data5_results_pnca

diff --git a/my_data5.py b/my_data5.py
new file mode 100644
index 0000000..500e6ba
--- /dev/null
+++ b/my_data5.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Mar  3 17:08:18 2022
+
+@author: tanu
+"""
+from sklearn.linear_model import LogisticRegression
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
+from sklearn.neural_network import MLPClassifier
+from sklearn.pipeline import Pipeline
+from xgboost import XGBClassifier
+from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import train_test_split
+import os
+from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
+import pandas as pd
+#%%
+homedir = os.path.expanduser("~")
+os.chdir(homedir + "/git/ML_AI_training/test_data")
+# NOTE: after the chdir above, the relative CSV path below resolves inside test_data.
+# TODO: confirm whether merged_df2 or merged_df3 is the right input file.
+# gene: 'pncA'
+drug = 'pyrazinamide'
+
+my_df = pd.read_csv("pnca_merged_df3.csv")
+# Bare expressions in this file are for interactive inspection; a script run discards them.
+my_df.dtypes
+my_df_cols = my_df.columns
+
+#%%============================================================================
+# GET Y: target encoded as 1 for 'DM' labels and 0 for 'OM' labels
+# Y = my_df.loc[:,drug] #has NA
+dm_om_map = {'DM': 1, 'OM': 0}
+my_df['resistance'] = my_df['mutation_info_labels'].map(dm_om_map)
+# NOTE(review): .map() yields NaN for any label other than 'DM'/'OM' -- TODO confirm none exist.
+# sanity check: the two value_counts below should show matching class sizes
+my_df['resistance'].value_counts()
+my_df['mutation_info_labels'].value_counts()
+Y = my_df['resistance']
+
+# GET X: three feature groups, each selected by column name
+cols = my_df.columns
+X_stability = my_df[['ligand_distance'
+           , 'ligand_affinity_change'
+           , 'duet_stability_change'
+           , 'ddg_foldx'
+           , 'deepddg'
+           , 'ddg_dynamut2']]
+
+X_evol =  my_df[['consurf_score'
+           , 'snap2_score'
+           , 'snap2_accuracy_pc']]
+
+X_str =  my_df[['asa'
+           , 'rsa'
+           , 'kd_values'
+           , 'rd_values']]
+
+#%% try combinations (run ONE assignment at a time; as a script only the last one takes effect)
+X_vars = X_stability
+X_vars = X_evol
+X_vars = X_str
+# Combined feature sets: column-wise concatenation of the groups above.
+X_vars = pd.concat([X_stability, X_evol, X_str], axis = 1)
+X_vars = pd.concat([X_stability, X_evol], axis = 1)
+X_vars = pd.concat([X_stability, X_str], axis = 1)
+X_vars = pd.concat([X_evol, X_str], axis = 1)
+
+#%%
+X_vars.shape[1]
+
+# TODO: stratified cross validate (the split below is a single, unstratified hold-out)
+# Train-test split: 33% held out; rs is also read by modelPipeline to seed the estimators
+rs = {'random_state': 42}
+X_train, X_test, y_train, y_test = train_test_split(X_vars, 
+                                                    Y, 
+                                                    test_size = 0.33, 
+                                                    random_state = 42)
+
+# Classification - Model Pipeline
+def modelPipeline(X_train, X_test, y_train, y_test):
+    """Fit a scaled pipeline per classifier and score each on the test split.
+
+    Each classifier is wrapped in a Pipeline behind a StandardScaler, fitted
+    on (X_train, y_train), and evaluated on (X_test, y_test) using the hard
+    class predictions returned by ``pipeline.predict``.
+
+    Parameters
+    ----------
+    X_train, X_test : feature matrices used for fitting and for evaluation.
+    y_train, y_test : target labels; metrics use their default binary averaging.
+
+    Returns
+    -------
+    pipelines : list of the fitted Pipeline objects, in the order of ``clfs``.
+    scores_df : DataFrame with one row per model and the columns
+        Model, F1_Score, Precision, Recall, Accuracy, ROC_AUC.
+
+    Note: ``random_state`` comes from the module-level ``rs`` dict, which
+    must be defined before this function is called.
+    """
+    # (display name, unfitted estimator) pairs; list order fixes the row order
+    clfs = [
+            ('Logistic Regression', LogisticRegression(**rs)),
+            ('Naive Bayes', BernoulliNB()),
+            ('K-Nearest Neighbors', KNeighborsClassifier()),
+            ('SVM', SVC(**rs)),
+            ('MLP', MLPClassifier(max_iter=500, **rs)),
+            ('Decision Tree', DecisionTreeClassifier(**rs)),
+            ('Extra Trees', ExtraTreesClassifier(**rs)),
+            ('Random Forest', RandomForestClassifier(**rs)),
+            ('XGBoost', XGBClassifier(**rs, verbosity=0))
+            ]
+
+    score_cols = ['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC']
+    pipelines = []
+    score_rows = []
+
+    for clf_name, clf in clfs:
+
+        pipeline = Pipeline(steps=[
+                                   ('scaler', StandardScaler()),
+                                   ('classifier', clf)
+                                   ]
+                            )
+        pipeline.fit(X_train, y_train)
+
+        # Model predictions
+        y_pred  = pipeline.predict(X_test)
+
+        pipelines.append(pipeline)
+
+        # Collect one row per model; DataFrame.append was deprecated in
+        # pandas 1.4 and removed in 2.0, and re-copied the frame each time.
+        score_rows.append({
+                           'Model'     : clf_name,
+                           'F1_Score'  : f1_score(y_test, y_pred),
+                           'Precision' : precision_score(y_test, y_pred),
+                           'Recall'    : recall_score(y_test, y_pred),
+                           'Accuracy'  : accuracy_score(y_test, y_pred),
+                           # computed from hard labels, not probabilities
+                           'ROC_AUC'   : roc_auc_score(y_test, y_pred)
+                           })
+
+    # Passing columns keeps the expected header even with no rows.
+    scores_df = pd.DataFrame(score_rows, columns=score_cols)
+
+    return pipelines, scores_df
+
+
+modelPipeline(X_train, X_test, y_train, y_test)  # returns (pipelines, scores_df); the result is neither stored nor printed here
\ No newline at end of file
diff --git a/my_data5_results_pnca b/my_data5_results_pnca
new file mode 100644
index 0000000..005c7e3
--- /dev/null
+++ b/my_data5_results_pnca
@@ -0,0 +1,84 @@
+ # stability [6]
+                  Model  F1_Score  Precision    Recall  Accuracy   ROC_AUC
+ 0  Logistic Regression  0.738854   0.698795  0.783784  0.707143  0.702498
+ 1          Naive Bayes  0.627451   0.607595  0.648649  0.592857  0.589476
+ 2  K-Nearest Neighbors  0.731707   0.666667  0.810811  0.685714  0.678133
+ 3                  SVM  0.729412   0.645833  0.837838  0.671429  0.661343
+ 4                  MLP  0.670968   0.641975  0.702703  0.635714  0.631654
+ 5        Decision Tree  0.653595   0.632911  0.675676  0.621429  0.618141
+ 6          Extra Trees  0.733728   0.652632  0.837838  0.678571  0.668919
+ 7        Random Forest  0.726190   0.648936  0.824324  0.671429  0.662162
+ 8              XGBoost  0.704403   0.658824  0.756757  0.664286  0.658681
+  
+ # evolution [3]
+                   Model  F1_Score  Precision    Recall  Accuracy   ROC_AUC
+ 0  Logistic Regression  0.795181   0.717391  0.891892  0.757143  0.748976
+ 1          Naive Bayes  0.805031   0.752941  0.864865  0.778571  0.773342
+ 2  K-Nearest Neighbors  0.735484   0.703704  0.770270  0.707143  0.703317
+ 3                  SVM  0.797619   0.712766  0.905405  0.757143  0.748157
+ 4                  MLP  0.787879   0.714286  0.878378  0.750000  0.742219
+ 5        Decision Tree  0.631579   0.615385  0.648649  0.600000  0.597052
+ 6          Extra Trees  0.688312   0.662500  0.716216  0.657143  0.653563
+ 7        Random Forest  0.704403   0.658824  0.756757  0.664286  0.658681
+ 8              XGBoost  0.713376   0.674699  0.756757  0.678571  0.673833
+
+# str features [4]
+                  Model  F1_Score  Precision    Recall  Accuracy   ROC_AUC
+ 0  Logistic Regression  0.729412   0.645833  0.837838  0.671429  0.661343
+ 1          Naive Bayes  0.723926   0.662921  0.797297  0.678571  0.671376
+ 2  K-Nearest Neighbors  0.662338   0.637500  0.689189  0.628571  0.624898
+ 3                  SVM  0.727273   0.627451  0.864865  0.657143  0.644554
+ 4                  MLP  0.710843   0.641304  0.797297  0.657143  0.648649
+ 5        Decision Tree  0.561151   0.600000  0.527027  0.564286  0.566544
+ 6          Extra Trees  0.567376   0.597015  0.540541  0.564286  0.565725
+ 7        Random Forest  0.596026   0.584416  0.608108  0.564286  0.561630
+ 8              XGBoost  0.630872   0.626667  0.635135  0.607143  0.605446
+ 
+ #=========================================================================
+ # stability + evolution + str features [13 = 6+3+4]
+                   Model  F1_Score  Precision    Recall  Accuracy   ROC_AUC
+ 0  Logistic Regression  0.726115   0.686747  0.770270  0.692857  0.688165
+ 1          Naive Bayes  0.730769   0.695122  0.770270  0.700000  0.695741
+ 2  K-Nearest Neighbors  0.742515   0.666667  0.837838  0.692857  0.684070
+ 3                  SVM  0.763636   0.692308  0.851351  0.721429  0.713554
+ 4                  MLP  0.717949   0.682927  0.756757  0.685714  0.681409
+ 5        Decision Tree  0.671429   0.712121  0.635135  0.671429  0.673628
+ 6          Extra Trees  0.756410   0.719512  0.797297  0.728571  0.724406
+ 7        Random Forest  0.742138   0.694118  0.797297  0.707143  0.701679
+ 8              XGBoost  0.692810   0.670886  0.716216  0.664286  0.661138
+  
+ # stability + evolution [9=6+3]
+                   Model  F1_Score  Precision    Recall  Accuracy   ROC_AUC
+ 0  Logistic Regression  0.729560   0.682353  0.783784  0.692857  0.687346
+ 1          Naive Bayes  0.743590   0.707317  0.783784  0.714286  0.710074
+ 2  K-Nearest Neighbors  0.720497   0.666667  0.783784  0.678571  0.672195
+ 3                  SVM  0.771084   0.695652  0.864865  0.728571  0.720311
+ 4                  MLP  0.679739   0.658228  0.702703  0.650000  0.646806
+ 5        Decision Tree  0.620690   0.633803  0.608108  0.607143  0.607084
+ 6          Extra Trees  0.727273   0.700000  0.756757  0.700000  0.696560
+ 7        Random Forest  0.734177   0.690476  0.783784  0.700000  0.694922
+ 8              XGBoost  0.675497   0.662338  0.689189  0.650000  0.647625
+ 
+ # stability + str features [10=6+4]
+                   Model  F1_Score  Precision    Recall  Accuracy   ROC_AUC
+ 0  Logistic Regression  0.750000   0.697674  0.810811  0.714286  0.708436
+ 1          Naive Bayes  0.714286   0.687500  0.743243  0.685714  0.682228
+ 2  K-Nearest Neighbors  0.687500   0.639535  0.743243  0.642857  0.636773
+ 3                  SVM  0.743902   0.677778  0.824324  0.700000  0.692465
+ 4                  MLP  0.716981   0.670588  0.770270  0.678571  0.673014
+ 5        Decision Tree  0.616438   0.625000  0.608108  0.600000  0.599509
+ 6          Extra Trees  0.697368   0.679487  0.716216  0.671429  0.668714
+ 7        Random Forest  0.684211   0.666667  0.702703  0.657143  0.654382
+ 8              XGBoost  0.666667   0.645570  0.689189  0.635714  0.632473
+ 
+ # evolution + str features[7=3+4]
+                   Model  F1_Score  Precision    Recall  Accuracy   ROC_AUC
+ 0  Logistic Regression  0.773006   0.707865  0.851351  0.735714  0.728706
+ 1          Naive Bayes  0.750000   0.730769  0.770270  0.728571  0.726044
+ 2  K-Nearest Neighbors  0.737500   0.686047  0.797297  0.700000  0.694103
+ 3                  SVM  0.763636   0.692308  0.851351  0.721429  0.713554
+ 4                  MLP  0.775758   0.703297  0.864865  0.735714  0.727887
+ 5        Decision Tree  0.675497   0.662338  0.689189  0.650000  0.647625
+ 6          Extra Trees  0.715232   0.701299  0.729730  0.692857  0.690622
+ 7        Random Forest  0.715232   0.701299  0.729730  0.692857  0.690622
+ 8              XGBoost  0.721519   0.678571  0.770270  0.685714  0.680590