added my_data5.py to run multiple classification algorithms and added preliminary results
This commit is contained in:
parent
1fecbc15c9
commit
bff16fc219
2 changed files with 240 additions and 0 deletions
156 my_data5.py Normal file
@@ -0,0 +1,156 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 3 17:08:18 2022

@author: tanu
"""
import os

import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

from xgboost import XGBClassifier
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
# TODO: check whether this should read merged_df2 or merged_df3
# gene: 'pncA'
drug = 'pyrazinamide'

my_df = pd.read_csv("pnca_merged_df3.csv")

my_df.dtypes
my_df_cols = my_df.columns
#%%============================================================================
# GET Y
# Y = my_df.loc[:, drug]  # has NA
# binary target: drug-associated mutation (DM) = 1, other mutation (OM) = 0
dm_om_map = {'DM': 1, 'OM': 0}
my_df['resistance'] = my_df['mutation_info_labels'].map(dm_om_map)

# sanity check
my_df['resistance'].value_counts()
my_df['mutation_info_labels'].value_counts()
Y = my_df['resistance']

# GET X
cols = my_df.columns
X_stability = my_df[['ligand_distance',
                     'ligand_affinity_change',
                     'duet_stability_change',
                     'ddg_foldx',
                     'deepddg',
                     'ddg_dynamut2']]

X_evol = my_df[['consurf_score',
                'snap2_score',
                'snap2_accuracy_pc']]

X_str = my_df[['asa',
               'rsa',
               'kd_values',
               'rd_values']]

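# feature-set sizes: stability = 6, evolution = 3, str = 4
# (these match the [6], [3] and [4] headers in my_data5_results_pnca)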
#%% try combinations: run ONE assignment at a time (each overwrites X_vars)
# X_vars = X_stability
# X_vars = X_evol
# X_vars = X_str

X_vars = pd.concat([X_stability, X_evol, X_str], axis = 1)
# X_vars = pd.concat([X_stability, X_evol], axis = 1)
# X_vars = pd.concat([X_stability, X_str], axis = 1)
# X_vars = pd.concat([X_evol, X_str], axis = 1)

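# optional sanity check (sketch): most sklearn estimators reject NaN, so it is
# worth confirming the selected columns are complete before fitting, e.g.
# X_vars.isna().sum()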
#%%
X_vars.shape[1]

# TODO: stratified cross validate
# Train-test Split
rs = {'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X_vars,
                                                    Y,
                                                    test_size = 0.33,
                                                    **rs)
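# NOTE (sketch for the TODO above, not what produced the results below): a
# stratified split would preserve the DM/OM class ratio in both partitions, e.g.
# X_train, X_test, y_train, y_test = train_test_split(
#     X_vars, Y, test_size = 0.33, stratify = Y, **rs)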
# Classification - Model Pipeline
def modelPipeline(X_train, X_test, y_train, y_test):

    log_reg = LogisticRegression(**rs)
    nb = BernoulliNB()
    knn = KNeighborsClassifier()
    svm = SVC(**rs)
    mlp = MLPClassifier(max_iter = 500, **rs)
    dt = DecisionTreeClassifier(**rs)
    et = ExtraTreesClassifier(**rs)
    rf = RandomForestClassifier(**rs)
    xgb = XGBClassifier(**rs, verbosity = 0)

    clfs = [
        ('Logistic Regression', log_reg),
        ('Naive Bayes', nb),
        ('K-Nearest Neighbors', knn),
        ('SVM', svm),
        ('MLP', mlp),
        ('Decision Tree', dt),
        ('Extra Trees', et),
        ('Random Forest', rf),
        ('XGBoost', xgb)
    ]

    pipelines = []
    scores = []

    for clf_name, clf in clfs:

        # scale features, then fit the classifier
        pipeline = Pipeline(steps = [
            ('scaler', StandardScaler()),
            ('classifier', clf)
        ])
        pipeline.fit(X_train, y_train)

        # Model predictions
        y_pred = pipeline.predict(X_test)

        # test-set metrics
        fscore = f1_score(y_test, y_pred)       # F1-Score
        pres = precision_score(y_test, y_pred)  # Precision
        rcall = recall_score(y_test, y_pred)    # Recall
        accu = accuracy_score(y_test, y_pred)   # Accuracy
        # ROC_AUC from hard class labels, not predicted probabilities
        roc_auc = roc_auc_score(y_test, y_pred)

        pipelines.append(pipeline)
        scores.append({'Model': clf_name,
                       'F1_Score': fscore,
                       'Precision': pres,
                       'Recall': rcall,
                       'Accuracy': accu,
                       'ROC_AUC': roc_auc})

    # DataFrame.append() is deprecated in pandas; build the frame once instead
    scores_df = pd.DataFrame(scores, columns = ['Model', 'F1_Score', 'Precision',
                                                'Recall', 'Accuracy', 'ROC_AUC'])

    return pipelines, scores_df


pipelines, scores_df = modelPipeline(X_train, X_test, y_train, y_test)
print(scores_df)
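# cross-validation sketch (assumed follow-up for the stratified-CV TODO; not
# run for the prelim results): StratifiedKFold keeps the DM/OM ratio per fold
# from sklearn.model_selection import StratifiedKFold, cross_val_score
# skf = StratifiedKFold(n_splits = 10, shuffle = True, **rs)
# pipe = Pipeline(steps = [('scaler', StandardScaler()),
#                          ('classifier', LogisticRegression(**rs))])
# cross_val_score(pipe, X_vars, Y, cv = skf, scoring = 'f1')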
84 my_data5_results_pnca Normal file
@@ -0,0 +1,84 @@
# stability [6]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.738854 0.698795 0.783784 0.707143 0.702498
1 Naive Bayes 0.627451 0.607595 0.648649 0.592857 0.589476
2 K-Nearest Neighbors 0.731707 0.666667 0.810811 0.685714 0.678133
3 SVM 0.729412 0.645833 0.837838 0.671429 0.661343
4 MLP 0.670968 0.641975 0.702703 0.635714 0.631654
5 Decision Tree 0.653595 0.632911 0.675676 0.621429 0.618141
6 Extra Trees 0.733728 0.652632 0.837838 0.678571 0.668919
7 Random Forest 0.726190 0.648936 0.824324 0.671429 0.662162
8 XGBoost 0.704403 0.658824 0.756757 0.664286 0.658681

# evolution [3]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.795181 0.717391 0.891892 0.757143 0.748976
1 Naive Bayes 0.805031 0.752941 0.864865 0.778571 0.773342
2 K-Nearest Neighbors 0.735484 0.703704 0.770270 0.707143 0.703317
3 SVM 0.797619 0.712766 0.905405 0.757143 0.748157
4 MLP 0.787879 0.714286 0.878378 0.750000 0.742219
5 Decision Tree 0.631579 0.615385 0.648649 0.600000 0.597052
6 Extra Trees 0.688312 0.662500 0.716216 0.657143 0.653563
7 Random Forest 0.704403 0.658824 0.756757 0.664286 0.658681
8 XGBoost 0.713376 0.674699 0.756757 0.678571 0.673833

# str features [4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.729412 0.645833 0.837838 0.671429 0.661343
1 Naive Bayes 0.723926 0.662921 0.797297 0.678571 0.671376
2 K-Nearest Neighbors 0.662338 0.637500 0.689189 0.628571 0.624898
3 SVM 0.727273 0.627451 0.864865 0.657143 0.644554
4 MLP 0.710843 0.641304 0.797297 0.657143 0.648649
5 Decision Tree 0.561151 0.600000 0.527027 0.564286 0.566544
6 Extra Trees 0.567376 0.597015 0.540541 0.564286 0.565725
7 Random Forest 0.596026 0.584416 0.608108 0.564286 0.561630
8 XGBoost 0.630872 0.626667 0.635135 0.607143 0.605446

#=========================================================================
# stability + evolution + str features [13 = 6+3+4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.726115 0.686747 0.770270 0.692857 0.688165
1 Naive Bayes 0.730769 0.695122 0.770270 0.700000 0.695741
2 K-Nearest Neighbors 0.742515 0.666667 0.837838 0.692857 0.684070
3 SVM 0.763636 0.692308 0.851351 0.721429 0.713554
4 MLP 0.717949 0.682927 0.756757 0.685714 0.681409
5 Decision Tree 0.671429 0.712121 0.635135 0.671429 0.673628
6 Extra Trees 0.756410 0.719512 0.797297 0.728571 0.724406
7 Random Forest 0.742138 0.694118 0.797297 0.707143 0.701679
8 XGBoost 0.692810 0.670886 0.716216 0.664286 0.661138

# stability + evolution [9 = 6+3]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.729560 0.682353 0.783784 0.692857 0.687346
1 Naive Bayes 0.743590 0.707317 0.783784 0.714286 0.710074
2 K-Nearest Neighbors 0.720497 0.666667 0.783784 0.678571 0.672195
3 SVM 0.771084 0.695652 0.864865 0.728571 0.720311
4 MLP 0.679739 0.658228 0.702703 0.650000 0.646806
5 Decision Tree 0.620690 0.633803 0.608108 0.607143 0.607084
6 Extra Trees 0.727273 0.700000 0.756757 0.700000 0.696560
7 Random Forest 0.734177 0.690476 0.783784 0.700000 0.694922
8 XGBoost 0.675497 0.662338 0.689189 0.650000 0.647625

# stability + str features [10 = 6+4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.750000 0.697674 0.810811 0.714286 0.708436
1 Naive Bayes 0.714286 0.687500 0.743243 0.685714 0.682228
2 K-Nearest Neighbors 0.687500 0.639535 0.743243 0.642857 0.636773
3 SVM 0.743902 0.677778 0.824324 0.700000 0.692465
4 MLP 0.716981 0.670588 0.770270 0.678571 0.673014
5 Decision Tree 0.616438 0.625000 0.608108 0.600000 0.599509
6 Extra Trees 0.697368 0.679487 0.716216 0.671429 0.668714
7 Random Forest 0.684211 0.666667 0.702703 0.657143 0.654382
8 XGBoost 0.666667 0.645570 0.689189 0.635714 0.632473

# evolution + str features [7 = 3+4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.773006 0.707865 0.851351 0.735714 0.728706
1 Naive Bayes 0.750000 0.730769 0.770270 0.728571 0.726044
2 K-Nearest Neighbors 0.737500 0.686047 0.797297 0.700000 0.694103
3 SVM 0.763636 0.692308 0.851351 0.721429 0.713554
4 MLP 0.775758 0.703297 0.864865 0.735714 0.727887
5 Decision Tree 0.675497 0.662338 0.689189 0.650000 0.647625
6 Extra Trees 0.715232 0.701299 0.729730 0.692857 0.690622
7 Random Forest 0.715232 0.701299 0.729730 0.692857 0.690622
8 XGBoost 0.721519 0.678571 0.770270 0.685714 0.680590