From 877862acb783928db9395fd03672cb3a2a3adc24 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 4 Mar 2022 19:16:04 +0000 Subject: [PATCH] added count for targets for all genes and ran multiple classification models for all of the genes and target as a start --- MultClassPipe.py | 95 +++++++ X_categories | 45 +++ __pycache__/MultClassPipe.cpython-37.pyc | Bin 0 -> 2275 bytes ml_data/.Rhistory | 335 +++++++++++++++++++++++ ml_data/del/ml_data_v1.R | 65 +++++ my_data6.py | 171 ++++++++++++ my_data_gid.py | 156 +++++++++++ my_data_target_counts.py | 81 ++++++ 8 files changed, 948 insertions(+) create mode 100644 MultClassPipe.py create mode 100644 X_categories create mode 100644 __pycache__/MultClassPipe.cpython-37.pyc create mode 100644 ml_data/.Rhistory create mode 100644 ml_data/del/ml_data_v1.R create mode 100644 my_data6.py create mode 100644 my_data_gid.py create mode 100644 my_data_target_counts.py diff --git a/MultClassPipe.py b/MultClassPipe.py new file mode 100644 index 0000000..592c193 --- /dev/null +++ b/MultClassPipe.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Mar 4 15:25:33 2022 + +@author: tanu +""" +#%% +import os, sys +import pandas as pd +from sklearn.linear_model import LogisticRegression +from sklearn.naive_bayes import BernoulliNB +from sklearn.neighbors import KNeighborsClassifier +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier +from sklearn.neural_network import MLPClassifier +from sklearn.pipeline import Pipeline +from xgboost import XGBClassifier +from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import MinMaxScaler +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score +#%% +rs = {'random_state': 42} + +# Multiple Classification - 
# Model Pipeline
def MultClassPipeline(X_train, X_test, y_train, y_test):
    """Fit a fixed set of baseline classifiers and score each on the test split.

    Every classifier is wrapped in a two-step ``Pipeline`` (``MinMaxScaler``
    followed by the classifier), fitted on the training data and evaluated
    on the test data.

    Parameters
    ----------
    X_train, X_test
        Feature matrices, passed unchanged to ``Pipeline.fit`` and
        ``Pipeline.predict`` respectively.
    y_train, y_test
        Target labels. The metric functions are called with their default
        arguments, so presumably the target is binary -- TODO confirm
        against the callers.

    Returns
    -------
    pipelines : list
        The fitted pipelines, in the same order as the rows of ``scores_df``.
    scores_df : pandas.DataFrame
        One row per model with the columns ``Model``, ``F1_Score``,
        ``Precision``, ``Recall``, ``Accuracy`` and ``ROC_AUC``.
    """
    # ``rs`` is the module-level dict of shared keyword arguments
    # (random_state) so that every seeded model uses the same seed.
    clfs = [
        ('Logistic Regression', LogisticRegression(**rs)),
        ('Naive Bayes', BernoulliNB()),
        ('K-Nearest Neighbors', KNeighborsClassifier()),
        ('SVM', SVC(**rs)),
        ('MLP', MLPClassifier(max_iter=500, **rs)),
        ('Decision Tree', DecisionTreeClassifier(**rs)),
        ('Extra Trees', ExtraTreesClassifier(**rs)),
        ('Random Forest', RandomForestClassifier(**rs)),
        ('XGBoost', XGBClassifier(**rs, verbosity=0)),
    ]

    score_columns = ['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC']

    pipelines = []
    score_rows = []

    for clf_name, clf in clfs:
        pipeline = Pipeline(steps=[
            ('scaler', MinMaxScaler()),
            ('classifier', clf),
        ])
        pipeline.fit(X_train, y_train)

        # Model predictions
        y_pred = pipeline.predict(X_test)

        pipelines.append(pipeline)

        # NOTE(review): ROC_AUC is computed from the hard class predictions,
        # not from probabilities / decision scores. Kept as in the original
        # so that reported values do not change.
        score_rows.append({
            'Model': clf_name,
            'F1_Score': f1_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'Accuracy': accuracy_score(y_test, y_pred),
            'ROC_AUC': roc_auc_score(y_test, y_pred),
        })

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame on every iteration. Passing ``columns`` keeps
    # the column order (and the empty-frame layout) of the original.
    scores_df = pd.DataFrame(score_rows, columns=score_columns)

    return pipelines, scores_df


# ---------------------------------------------------------------------------
# Feature-name lists from the ``X_categories`` file
# (Created on Fri Mar 4 15:09:37 2022, author: tanu)
# ---------------------------------------------------------------------------

X_categ_str = [
    'ss_class',
    'wt_prop_water',
    'mut_prop_water',
    'wt_prop_polarity',
    'mut_prop_polarity',
    'wt_calcprop',
    'mut_calcprop',
    'active_aa_pos',
]

# only valid if we use merged_df2
X_categ_str_lin = X_categ_str + ['lineage_labels']

# Every name needs its own comma: Python joins adjacent string literals, so
# without separators this list would hold one long concatenated string
# instead of 25 separate names.
X_categ_foldx = [
    'contacts',
    'electro_rr',
    'electro_mm',
    'electro_sm',
    'electro_ss',
    'disulfide_rr',
    'disulfide_mm',
    'disulfide_sm',
    'disulfide_ss',
    'hbonds_rr',
    'hbonds_mm',
    'hbonds_sm',
    'hbonds_ss',
    'partcov_rr',
    'partcov_mm',
    'partcov_sm',
    'partcov_ss',
    'vdwclashes_rr',
    'vdwclashes_mm',
    'vdwclashes_sm',
    'vdwclashes_ss',
    'volumetric_rr',
    'volumetric_mm',
    'volumetric_sm',
    'volumetric_ss',
]

# NOTE: the git binary patch for __pycache__/MultClassPipe.cpython-37.pyc that
# followed here (compiled bytecode, truncated in the visible source) is not
# reproduced.