renamed hyperparams to gscv

This commit is contained in:
Tanushree Tunstall 2022-03-22 11:08:20 +00:00
parent a82358dbb4
commit ad5ebad7f8
31 changed files with 4433 additions and 0 deletions

Binary file not shown.


@ -0,0 +1,264 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 18 09:47:48 2022
@author: tanu
"""
#%% Useful links
# https://stackoverflow.com/questions/41844311/list-of-all-classification-algorithms
# https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
# https://github.com/davidsbatista/machine-learning-notebooks/blob/master/hyperparameter-across-models.ipynb
# https://scikit-learn.org/stable/modules/svm.html#classification
# https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/ # [params]
# https://uk.mathworks.com/help/stats/hyperparameter-optimization-in-classification-learner-app.html [ algo]
# As a rule of thumb, run baseline models on the dataset first. H2O AutoML and other AutoML packages do this automatically, but here the aim is to try it with a scikit-learn Pipeline:
# https://codereview.stackexchange.com/questions/256934/model-pipeline-to-run-multiple-classifiers-for-ml-classification
# https://uk.mathworks.com/help/stats/hyperparameter-optimization-in-classification-learner-app.html
# QDA: https://www.geeksforgeeks.org/quadratic-discriminant-analysis/
names = [
"Nearest Neighbors",
"Linear SVM",
"RBF SVM",
"Gaussian Process",
"Decision Tree",
"Random Forest",
"Neural Net",
"AdaBoost",
"Naive Bayes",
"QDA",
]
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="linear", C=0.025),
SVC(gamma=2, C=1),
GaussianProcessClassifier(1.0 * RBF(1.0)),
DecisionTreeClassifier(max_depth=5),
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
MLPClassifier(alpha=1, max_iter=1000),
AdaBoostClassifier(),
GaussianNB(),
QuadraticDiscriminantAnalysis(),
]
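#%% Baseline sweep (hedged sketch)
# The rule-of-thumb note above says to run baseline models first; this helper
# loops the names/classifiers lists through a common scaled pipeline. X and y
# (and the estimator imports for the list above) are assumed to exist in the session.
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

def run_baselines(X, y, cv = 5):
    for name, clf in zip(names, classifiers):
        pipe = Pipeline([('pre', MinMaxScaler()), ('clf', clf)])
        scores = cross_val_score(pipe, X, y, cv = cv, scoring = 'accuracy')
        print(f'{name}: mean CV accuracy = {scores.mean():.3f}')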
# NOTE Logistic regression
# The choice of the algorithm depends on the penalty chosen: Supported penalties by solver:
# newton-cg - [l2, none]
# lbfgs - [l2, none]
# liblinear - [l1, l2]
# sag - [l2, none]
# saga - [elasticnet, l1, l2, none]
# SVR?
# estimator=SVR(kernel='rbf')
# param_grid={
# 'C': [1.1, 5.4, 170, 1001],
# 'epsilon': [0.0003, 0.007, 0.0109, 0.019, 0.14, 0.05, 8, 0.2, 3, 2, 7],
# 'gamma': [0.7001, 0.008, 0.001, 3.1, 1, 1.3, 5]
# }
#%% Classification algorithms param grid
# NOTE: the pipelines below assume two dicts defined elsewhere in the session
# (see the companion scripts in this commit), e.g.:
# rs = {'random_state': 42}
# njobs = {'n_jobs': -1} # assumed; only rs is shown in the other scripts
#%% LogisticRegression()
#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
gs_lr = Pipeline((
('pre' , MinMaxScaler())
,('clf', LogisticRegression(**rs
, **njobs))
))
gs_lr_params = {
'clf__C' : [0.0001, 0.001, 0.01, 0.1 ,1, 10, 100]
#'C': np.logspace(-4, 4, 50)
, 'clf__penalty': ['l1', 'l2', 'elasticnet', 'none']
, 'clf__solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}
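# Hedged sketch tying the NOTE on solver/penalty support above to this grid:
# a LIST of param-grid dicts keeps GridSearchCV from trying unsupported
# solver/penalty combos (e.g. newton-cg with l1). C values mirror gs_lr_params.
gs_lr_params_compat = [
    {'clf__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
     , 'clf__penalty': ['l2', 'none']
     , 'clf__solver': ['newton-cg', 'lbfgs', 'sag']}
    , {'clf__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
       , 'clf__penalty': ['l1', 'l2']
       , 'clf__solver': ['liblinear']}
    , {'clf__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
       , 'clf__penalty': ['l1', 'l2', 'none']
       , 'clf__solver': ['saga']}
    , {'clf__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
       , 'clf__penalty': ['elasticnet']
       , 'clf__solver': ['saga']
       , 'clf__l1_ratio': [0.25, 0.5, 0.75]} # elasticnet additionally needs l1_ratio
]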
#%% DecisionTreeClassifier()
gs_dt = Pipeline((
    ('pre' , MinMaxScaler())
    , ('clf', DecisionTreeClassifier(**rs)) # no n_jobs param on DecisionTreeClassifier
))
gs_dt_params = {
'clf__max_depth': [ 2, 4, 6, 8, 10]
, 'clf__criterion':['gini','entropy']
, "clf__max_features":["auto", None]
, "clf__max_leaf_nodes":[10,20,30,40]
}
#%% KNeighborsClassifier()
#https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
gs_knn = Pipeline((
    ('pre' , MinMaxScaler())
    ,('clf', KNeighborsClassifier(**njobs)) # KNN is deterministic: no random_state param
))
gs_knn_params = {
'clf__n_neighbors': [3, 7, 10]
#, 'clf__n_neighbors': range(1, 21, 2)
,'clf__metric' : ['euclidean', 'manhattan', 'minkowski']
, 'clf__weights' : ['uniform', 'distance']
}
#%% RandomForestClassifier()
gs_rf = Pipeline((
('pre' , MinMaxScaler())
,('clf', RandomForestClassifier(**rs
, **njobs
, bootstrap = True
, oob_score = True))
))
gs_rf_params = {
'clf__max_depth': [4, 6, 8, 10, 12, 16, 20, None]
, 'clf__class_weight':['balanced','balanced_subsample']
, 'clf__n_estimators': [10, 100, 1000]
, 'clf__criterion': ['gini', 'entropy']
, 'clf__max_features': ['auto', 'sqrt']
, 'clf__min_samples_leaf': [2, 4, 8, 50]
, 'clf__min_samples_split': [10, 20]
}
#%% XGBClassifier()
# https://stackoverflow.com/questions/34674797/xgboost-xgbclassifier-defaults-in-python
# https://stackoverflow.com/questions/34674797/xgboost-xgbclassifier-defaults-in-python
gs_xgb = Pipeline((
('pre' , MinMaxScaler())
,('clf', XGBClassifier(**rs
, **njobs))
))
gs_xgb_params = {
    'clf__learning_rate': [0.01, 0.05, 0.1, 0.2]
    , 'clf__max_depth': [4, 6, 8, 10, 12, 16, 20]
    # min_samples_leaf/max_features are sklearn tree params, not XGBoost ones;
    # the closest XGBoost knobs (values here are illustrative) are:
    , 'clf__min_child_weight': [1, 5, 10]
    , 'clf__colsample_bytree': [0.5, 0.7, 1.0]
}
#%% MLPClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
gs_mlp = Pipeline((
    ('pre' , MinMaxScaler())
    ,('clf', MLPClassifier(**rs
                           , max_iter = 500)) # no n_jobs param on MLPClassifier
))
gs_mlp_params = {
    'clf__hidden_layer_sizes': [(1,), (2,), (3,)]
    # max_features/min_samples_* are tree params, not MLP ones; the usual MLP
    # knobs (values here are illustrative) are alpha and solver instead:
    , 'clf__alpha': [0.0001, 0.001, 0.01]
    , 'clf__solver': ['lbfgs', 'sgd', 'adam']
}
#%% RidgeClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifier.html
gs_rc = Pipeline((
    ('pre' , MinMaxScaler()) # CHECK if it wants -1 to 1
    ,('clf', RidgeClassifier(**rs)) # no n_jobs param on RidgeClassifier
))
gs_rc_params = {
'clf__alpha': [0.1, 0.2, 0.5, 0.8, 1.0]
}
#%% SVC()
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
gs_svc = Pipeline((
    ('pre' , MinMaxScaler()) # CHECK if it wants -1 to 1
    ,('clf', SVC(**rs)) # no n_jobs param on SVC
))
gs_svc_params = {
    'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'] # 'precomputed' needs a kernel matrix, not raw X
    , 'clf__C' : [50, 10, 1.0, 0.1, 0.01]
    , 'clf__gamma': ['scale', 'auto'] }
#%% BaggingClassifier()
#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
gs_bdt = Pipeline((
('pre' , MinMaxScaler()) # CHECK if it wants -1 to 1
,('clf', BaggingClassifier(**rs
, **njobs
, bootstrap = True
, oob_score = True))
))
gs_bdt_params = {
    'clf__n_estimators' : [10, 100, 1000]
    # base_estimator takes estimator objects, not strings; if None, a DecisionTreeClassifier is used
    , 'clf__base_estimator' : [None, SVC(), KNeighborsClassifier()]
    # (gamma is an SVC param, so it does not belong in this grid)
}
#%% GradientBoostingClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
gs_gb = Pipeline((
('pre' , MinMaxScaler()) # CHECK if it wants -1 to 1
,('clf', GradientBoostingClassifier(**rs))
))
gs_gb_params = { # distinct name so it does not clobber the Bagging grid
    'clf__n_estimators' : [10, 100, 1000]
, 'clf__learning_rate': [0.001, 0.01, 0.1]
, 'clf__subsample' : [0.5, 0.7, 1.0]
, 'clf__max_depth' : [3, 7, 9]
}
#%% AdaBoostClassifier()
#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#sklearn.ensemble.AdaBoostClassifier
gs_ada = Pipeline(( # distinct name: gs_gb above is the GradientBoosting pipeline
    ('pre' , MinMaxScaler()) # CHECK if it wants -1 to 1
    ,('clf', AdaBoostClassifier(**rs))
))
gs_ada_params = {
    'clf__n_estimators': [1, 2] # must be positive ints
    # base_estimator takes estimator objects; None falls back to DecisionTreeClassifier(max_depth=1)
    # NOTE: the default SAMME.R algorithm needs predict_proba (SVC would need
    # probability=True) and KNN does not support sample_weight, so these two
    # entries are illustrative rather than guaranteed to fit
    , 'clf__base_estimator' : [None, SVC(), KNeighborsClassifier()]
    #, 'clf___splitter' : ["best", "random"]
}
#%% GaussianProcessClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessClassifier.html
#GaussianProcessClassifier(1.0 * RBF(1.0)),
gs_gpc = Pipeline((
('pre' , MinMaxScaler()) # CHECK if it wants -1 to 1
,('clf', GaussianProcessClassifier(**rs))
))
gs_gpc_params = {
    # needs: from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel
    'clf__kernel': [1*RBF(), 1*DotProduct(), 1*Matern(), 1*RationalQuadratic(), 1*WhiteKernel()]
}
#%% GaussianNB()
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
gs_gnb = Pipeline((
    ('pre' , MinMaxScaler())
    , ('pca', PCA() ) # CHECK if it wants -1 to 1
    ,('clf', GaussianNB()) # no random_state param on GaussianNB
))
gs_gnb_params = {
'clf__priors': [None]
, 'clf__var_smoothing': np.logspace(0,-9, num=100)
}
#%% QuadraticDiscriminantAnalysis()
#https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html
gs_qda = Pipeline((
('pre' , MinMaxScaler())
#, ('pca', PCA() )# CHECK if it wants -1 to 1
,('clf', QuadraticDiscriminantAnalysis())
))
#%% BernoulliNB()
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html
gs_bnb = Pipeline(( # distinct name: gs_gnb above is the GaussianNB pipeline
    ('pre' , MinMaxScaler())
    ,('clf', BernoulliNB())
))
# defaults: BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
gs_bnb_params = {
    'clf__alpha': [0, 1]
    , 'clf__binarize': [None, 0] # None (not the string 'None') means data is already binary
    , 'clf__fit_prior': [True]
    , 'clf__class_prior': [None]
}
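#%% Driving the grids above (hedged sketch)
# Nothing in this file actually fits anything; a minimal driver could look like
# this, assuming X_train/y_train exist and njobs is defined as above.
from sklearn.model_selection import GridSearchCV, StratifiedKFold

def run_gscv(pipe, params, X_train, y_train, scoring = 'f1'):
    skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)
    gscv = GridSearchCV(pipe, params, cv = skf, scoring = scoring, **njobs)
    gscv.fit(X_train, y_train)
    print('Best model:\n', gscv.best_params_)
    print('Best models score:\n', gscv.best_score_)
    return gscv

# example: run_gscv(gs_lr, gs_lr_params, X_train, y_train)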

earlier_versions/GSCV_base (new file, 128 lines)

@ -0,0 +1,128 @@
# Logistic regression:
pnca
input: numerical features
output: dm/om: target
grid search/base estimator with a single model and hyperparameter choices: gives you the best model based on a SINGLE metric!
-- question: which metric should be optimised for?
base estimator with multiple models and multiple hyperparameters: returns the OVERALL best model-hyperparameter combo, based on a single score?
-- question: which metric should be optimised for?
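One way to handle the metric question is multi-metric grid search: pass several
scorers and pick one to refit on. A hedged sketch, assuming pipe/params/X/y
exist in the session:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, matthews_corrcoef

scoring = {'accuracy': 'accuracy', 'f1': 'f1', 'recall': 'recall'
           , 'roc_auc': 'roc_auc', 'mcc': make_scorer(matthews_corrcoef)}
gscv = GridSearchCV(pipe, params, scoring = scoring, refit = 'mcc', cv = 10)
gscv.fit(X, y)
print('Best model:\n', gscv.best_params_)
print('Best models score:\n', gscv.best_score_)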
# Demonstration
###################
# Metric1: accuracy
###################
Best model:
{'clf__max_iter': 100, 'clf__solver': 'liblinear'}
Best models score:
0.7145320197044336
###################
# Metric2: F1
###################
Best model:
{'clf__max_iter': 100, 'clf__solver': 'saga'}
Best models score:
0.7550294183111348
###################
# Metric3: Recall
###################
Best model:
{'clf__max_iter': 100, 'clf__solver': 'saga'}
Best models score:
0.8216666666666667
###################
# Metric4: ROC_AUC
###################
Best model:
{'clf__max_iter': 200, 'clf__solver': 'sag'}
Best models score:
0.7711904761904762
###################
# Metric5: MCC
###################
Best model:
{'clf__max_iter': 100, 'clf__solver': 'saga'}
Best models score:
0.4322970173039572
sklearn/linear_model/_sag.py:354: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
ConvergenceWarning,
#####################################
# Same thing but using: CLFSwitcher()
###################
# Metric1: Accuracy
###################
Best model:
{'clf__estimator': LogisticRegression(random_state=42, solver='liblinear')
, 'clf__estimator__max_iter': 100, 'clf__estimator__solver': 'liblinear'}
Best models score:
0.7219298245614035
###################
# Metric2: F1
###################
Best model:
{'clf__estimator': LogisticRegression(random_state=42, solver='liblinear'), 'clf__estimator__max_iter': 100, 'clf__estimator__solver': 'liblinear'}
print('Best models score:\n', gscv.best_score_)
Best models score:
0.7585724070894442
###################
# Metric3: Recall
###################
Best model:
{'clf__estimator': LogisticRegression(random_state=42, solver='liblinear')
, 'clf__estimator__max_iter': 100, 'clf__estimator__solver': 'liblinear'}
Best models score:
0.8198610213316095
###################
# Metric4: ROC_AUC
###################
Best model:
{'clf__estimator': LogisticRegression(solver='newton-cg')
, 'clf__estimator__max_iter': 100, 'clf__estimator__solver': 'newton-cg'}
Best models score:
nan
###################
# Metric5: MCC
###################
Best model:
{'clf__estimator': LogisticRegression(random_state=42, solver='liblinear')
, 'clf__estimator__max_iter': 100, 'clf__estimator__solver': 'liblinear'}
Best models score:
0.4480248700902755
print('Best model:\n', gs_dt.best_params_)
Best model:
{'criterion': 'entropy', 'max_depth': 2, 'max_features': None, 'max_leaf_nodes': 10}
print('Best models score:\n', gs_dt.best_score_)
Best models score:
0.43290518915746007
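For reference, the CLFSwitcher used above is defined elsewhere; a common
minimal implementation of such an estimator switcher (a hedged sketch, not
necessarily the exact class used here) is:

from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression

class CLFSwitcher(BaseEstimator):
    """Estimator wrapper whose 'estimator' param GridSearchCV can swap."""
    def __init__(self, estimator = LogisticRegression()):
        self.estimator = estimator
    def fit(self, X, y = None, **kwargs):
        self.estimator.fit(X, y)
        return self
    def predict(self, X):
        return self.estimator.predict(X)
    def predict_proba(self, X):
        return self.estimator.predict_proba(X)
    def score(self, X, y):
        return self.estimator.score(X, y)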


@ -0,0 +1,109 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 4 15:25:33 2022
@author: tanu
"""
#%%
import os, sys
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
#%%
rs = {'random_state': 42}
# TODO: add preprocessing step with one hot encoder
# Multiple Classification - Model Pipeline
def MultClassPipeline(X_train, X_test, y_train, y_test):
    log_reg = LogisticRegression(**rs)
    nb = BernoulliNB()
    knn = KNeighborsClassifier()
    svm = SVC(**rs)
    mlp = MLPClassifier(max_iter=500, **rs)
    dt = DecisionTreeClassifier(**rs)
    et = ExtraTreesClassifier(**rs)
    rf = RandomForestClassifier(**rs)
    rf2 = RandomForestClassifier(
        min_samples_leaf=50,
        n_estimators=150,
        bootstrap=True,
        oob_score=True,
        n_jobs=-1,
        random_state=42,
        max_features='auto')
    xgb = XGBClassifier(**rs, verbosity=0)

    clfs = [
        ('Logistic Regression', log_reg),
        ('Naive Bayes', nb),
        ('K-Nearest Neighbors', knn),
        ('SVM', svm),
        ('MLP', mlp),
        ('Decision Tree', dt),
        ('Extra Trees', et),
        ('Random Forest', rf),
        ('Random Forest2', rf2),
        ('XGBoost', xgb)
    ]

    pipelines = []
    scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])

    for clf_name, clf in clfs:
        pipeline = Pipeline(steps=[
            ('scaler', MinMaxScaler()),
            #('scaler', StandardScaler()),
            ('classifier', clf)
        ])
        pipeline.fit(X_train, y_train)
        # Model predictions
        y_pred = pipeline.predict(X_test)
        # F1-Score
        fscore = f1_score(y_test, y_pred)
        # Precision
        pres = precision_score(y_test, y_pred)
        # Recall
        recall = recall_score(y_test, y_pred)
        # Accuracy
        accu = accuracy_score(y_test, y_pred)
        # ROC_AUC
        roc_auc = roc_auc_score(y_test, y_pred)
        # Matthews correlation coefficient
        mcc = matthews_corrcoef(y_test, y_pred)
        pipelines.append(pipeline)
        scores_df = scores_df.append({
            'Model'      : clf_name
            , 'F1_Score' : fscore
            , 'MCC'      : mcc
            , 'Precision': pres
            , 'Recall'   : recall
            , 'Accuracy' : accu
            , 'ROC_AUC'  : roc_auc
        }
        , ignore_index = True)
    return pipelines, scores_df
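# Usage (hedged sketch): assumes a train/test split made upstream with the
# train_test_split imported above.
# pipelines, scores_df = MultClassPipeline(X_train, X_test, y_train, y_test)
# print(scores_df.sort_values('MCC', ascending = False))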


@ -0,0 +1,48 @@
# Stratified K-fold vs ShuffleSplit
https://stackoverflow.com/questions/45969390/difference-between-stratifiedkfold-and-stratifiedshufflesplit-in-sklearn
In ShuffleSplit, the data is shuffled and then split on every iteration, so the test sets may overlap between splits.
In SKF, the test sets don't overlap.
So the difference is: StratifiedKFold shuffles and splits just once, so the test sets do not overlap, while StratifiedShuffleSplit re-shuffles before each of its n_splits splits, so the test sets can overlap.
Note: both methods use stratified folds (that's why "stratified" appears in both names): each part preserves the same percentage of samples of each class (label) as the original data. You can read more in the cross-validation documentation.
''' python code '''
splits = 5
tx = range(10)
ty = [0] * 5 + [1] * 5

from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold

kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
shufflesplit = StratifiedShuffleSplit(n_splits=splits, random_state=42, test_size=2)

print("KFold")
for train_index, test_index in kfold.split(tx, ty):
    print("TRAIN:", train_index, "TEST:", test_index)

print("Shuffle Split")
for train_index, test_index in shufflesplit.split(tx, ty):
    print("TRAIN:", train_index, "TEST:", test_index)
'''
Output:
KFold
TRAIN: [0 2 3 4 5 6 7 9] TEST: [1 8]
TRAIN: [0 1 2 3 5 7 8 9] TEST: [4 6]
TRAIN: [0 1 3 4 5 6 8 9] TEST: [2 7]
TRAIN: [1 2 3 4 6 7 8 9] TEST: [0 5]
TRAIN: [0 1 2 4 5 6 7 8] TEST: [3 9]
Shuffle Split
TRAIN: [8 4 1 0 6 5 7 2] TEST: [3 9]
TRAIN: [7 0 3 9 4 5 1 6] TEST: [8 2]
TRAIN: [1 2 5 6 4 8 9 0] TEST: [3 7]
TRAIN: [4 6 7 8 3 5 1 2] TEST: [9 0]
TRAIN: [7 2 6 5 4 3 0 9] TEST: [1 8]


@ -0,0 +1,29 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 10:59:36 2022
@author: tanu
"""
# numerical
#log_reg (rs)
F1_score 0.713380
MCC 0.376546
Precision 0.687628
Recall 0.747231
Accuracy 0.687293
ROC_curve 0.683199
#log_reg (balanced)
F1_score 0.715106
MCC 0.390225
Precision 0.702629
Recall 0.733445
Accuracy 0.694309
ROC_curve 0.691555
#log_reg (unbalanced)
F1_score 0.713380
MCC 0.376546
Precision 0.687628
Recall 0.747231
Accuracy 0.687293
ROC_curve 0.683199


@ -0,0 +1,229 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 6 13:41:54 2022
@author: tanu
"""
import os, sys
import pandas as pd
import numpy as np
#from copy import deepcopy
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
from statistics import mean, stdev, median, mode
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/")
# my function
from MultClassPipe import MultClassPipeline
from MultClassPipe2 import MultClassPipeline2
from loopity_loop import MultClassPipeSKF
gene = 'pncA'
drug = 'pyrazinamide'
#==============
# directories
#==============
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
#=======
# input
#=======
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
my_df = pd.read_csv(infile_ml1)
my_df.dtypes
my_df_cols = my_df.columns
geneL_basic = ['pnca']
geneL_na = ['gid']
geneL_na_ppi2 = ['rpob']
geneL_ppi2 = ['alr', 'embb', 'katg']
#%% get cols
mycols = my_df.columns
my_df['active_aa_pos'].dtype
my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
if gene.lower() in geneL_na_ppi2:
    # D1148: get rid of
    na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
    my_df = my_df.drop(index=na_index)
#%%============================================================================
# GET Y
# Target1: mutation_info_labels
dm_om_map = {'DM': 1, 'OM': 0}
target1 = my_df['mutation_info_labels'].map(dm_om_map)
target1.value_counts()
# Target2: drug
drug_labels = drug + '_labels'
drug_labels
my_df[drug_labels] = my_df[drug].map({1: 'resistant', 0: 'sensitive'})
my_df[drug_labels].value_counts()
my_df[drug_labels] = my_df[drug_labels].fillna('unknown')
my_df[drug_labels].value_counts()
target2 = my_df[drug_labels]
# Target3: drtype [Binary]
drtype_labels = 'drtype_labels'
my_df[drtype_labels] = my_df['drtype'].map({'Sensitive' : 0
, 'Other' : 0
, 'Pre-MDR' : 1
, 'MDR' : 1
, 'Pre-XDR' : 1
, 'XDR' : 1})
# target3 = 'drtype' [Multinomial]
target3 = my_df[drtype_labels]
# target4
drtype_labels2 = 'drtype_labels2'
my_df[drtype_labels2] = my_df['drtype'].map({'Sensitive' : 0
, 'Other' : 0
, 'Pre-MDR' : 1
, 'MDR' : 1
, 'Pre-XDR' : 2
, 'XDR' : 2})
target4 = my_df[drtype_labels2]
# sanity checks
target1.value_counts()
my_df['mutation_info_labels'].value_counts()
target2.value_counts()
my_df[drug_labels].value_counts()
target3.value_counts()
my_df['drtype'].value_counts()
target4.value_counts()
my_df['drtype'].value_counts()
#%%
# GET X
common_cols_stabiltyN = ['ligand_distance'
, 'ligand_affinity_change'
, 'duet_stability_change'
, 'ddg_foldx'
, 'deepddg'
, 'ddg_dynamut2']
# Build stability columns ~ gene
if gene.lower() in geneL_basic:
    x_stabilityN = common_cols_stabiltyN
if gene.lower() in geneL_ppi2:
    x_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity'
                                            , 'interface_dist']
if gene.lower() in geneL_na:
    x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity']
if gene.lower() in geneL_na_ppi2:
    x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
#D1148 get rid of
#na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
#my_df = my_df.drop(index=na_index)
X_strFN = ['asa'
, 'rsa'
, 'kd_values'
, 'rd_values']
X_evolFN = ['consurf_score'
, 'snap2_score'
, 'snap2_accuracy_pc']
# TODO: ADD ED values
# Problematic due to NA: filling NA with unknown or string will make it categorical
# OPTIONS
# 1. Imputing: KNN or MICE or from distribution
# 2. Fill na with median or mode
# 3. Separate datset without including genomic features AT ALL for ML, then using this as a 'blind test set'
# this means the size of the training data gets reduced!
# 4. Remove genomic features from ML COMPLETELEY!
# X_genomicFN = ['af'
# , 'or_mychisq'
# , 'or_logistic'
# , 'or_fisher'
# , 'pval_fisher']
#%% try combinations
X_vars1 = my_df[x_stabilityN]
X_vars2 = my_df[X_strFN]
X_vars3 = my_df[X_evolFN]
X_vars5 = my_df[x_stabilityN + X_strFN]
X_vars6 = my_df[x_stabilityN + X_evolFN]
#X_vars7 = my_df[x_stabilityN + X_genomicFN]
X_vars8 = my_df[X_strFN + X_evolFN]
#X_vars9 = my_df[X_strFN + X_genomicFN]
#X_vars10 = my_df[X_evolFN + X_genomicFN]
X_vars11 = my_df[x_stabilityN + X_strFN + X_evolFN]
#X_vars12 = my_df[x_stabilityN + X_strFN + X_evolFN + X_genomicFN]
numerical_features_names = x_stabilityN + X_strFN + X_evolFN
# separate ones for foldx?
categorical_features_names = ['ss_class'
, 'wt_prop_water'
# , 'lineage_labels' # misleading if using merged_df3
, 'mut_prop_water'
, 'wt_prop_polarity'
, 'mut_prop_polarity'
, 'wt_calcprop'
, 'mut_calcprop'
, 'active_aa_pos']
numerical_features_df = my_df[numerical_features_names]
numerical_features_df.shape
categorical_features_df = my_df[categorical_features_names]
categorical_features_df.shape
all_features_df = my_df[numerical_features_names + categorical_features_names]
all_features_df.shape
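#%% Mixed-type preprocessing (hedged sketch)
# The numerical/categorical splits above need different preprocessing per
# column type; one way to wire that up, mirroring the ColumnTransformer
# pattern used in the companion scripts (imports are already done above):
t = [('num', MinMaxScaler(), numerical_features_names)
     , ('cat', OneHotEncoder(), categorical_features_names)]
col_transform = ColumnTransformer(transformers = t
                                  , remainder = 'passthrough')
# col_transform can then be the first step of any Pipeline over all_features_df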


@ -0,0 +1,82 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 18:06:34 2022
@author: tanu
"""
#%%
models = [
('Logistic Regression' , log_reg)
, ('K-Nearest Neighbors', knn)
]
classification_metrics = {
'F1_score': []
,'MCC': []
,'Precision': []
,'Recall': []
,'Accuracy': []
,'ROC_curve': []
}
folds=[1,2]
fold_no=1
fold_dict={}
import random # needed for the dummy F1 scores below
import pprint; pp = pprint.PrettyPrinter(indent = 4)

for model_name, model in models:
    fold_dict.update({model_name: {}})

for f in folds:
    fold = ("fold_" + str(fold_no))
    for model_name, model in models:
        print("start of model", model_name, "fold: ", fold)
        fold_dict[model_name].update({fold: {}})
        fold_dict[model_name][fold].update(classification_metrics)
        print("end of model", model_name, "fold: ", fold)
        fold_dict[model_name][fold].update({'F1_score': random.randrange(1, 10)})
    fold_no += 1
pp.pprint(fold_dict)
#%%
folds_f1 = []
for model_name, model in models:
    print("Calculating mean for F1_score for: ", model_name)
    #for key in fold_dict['Logistic Regression']:
    # wrap this in a classification_metric for loop
    for key in fold_dict[model_name]:
        # use model_name here; hard-coding 'Logistic Regression' would read
        # the same model's scores on every pass
        folds_f1.append(fold_dict[model_name][key]['F1_score'])
        #folds_f1.append(folds_f1)
        print('key:', key, 'F1scores:', folds_f1)
mean(folds_f1)
#%%
scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
# manually
model_name = 'Logistic Regression'
model_metric = 'F1_score'
log_reg_f1 = []
for key in fold_dict[model_name]:
    log_reg_f1.append(fold_dict[model_name][key][model_metric])
log_reg_f1M = mean(log_reg_f1)
print('key:', key, model_metric, ':', log_reg_f1)
print(log_reg_f1M)
log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
log_reg_f1df
#%%
model_metric = 'MCC'
log_reg_mcc = []
for key in fold_dict[model_name]:
    log_reg_mcc.append(fold_dict[model_name][key][model_metric])
log_reg_mccM = mean(log_reg_mcc)
print('key:', key, model_metric, ':', log_reg_mcc)
print(log_reg_mccM)
log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
log_reg_mccdf
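#%% Generalised aggregation (hedged sketch)
# The manual per-metric blocks above, wrapped in the "classification_metric
# for loop" hinted at earlier: one pass over all models and metrics.
from statistics import mean
import pandas as pd

mean_scores = {}
for model_name, model in models:
    mean_scores[model_name] = {}
    for metric in classification_metrics:
        vals = [fold_dict[model_name][fold][metric]
                for fold in fold_dict[model_name]]
        # only F1_score holds real numbers in this demo; the other metrics
        # still carry the empty-list placeholder, so guard before averaging
        mean_scores[model_name][metric] = (mean(vals)
                                           if all(isinstance(v, (int, float)) for v in vals)
                                           else None)
print(pd.DataFrame(mean_scores))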


@ -0,0 +1,84 @@
# stabilty [6]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.738854 0.698795 0.783784 0.707143 0.702498
1 Naive Bayes 0.627451 0.607595 0.648649 0.592857 0.589476
2 K-Nearest Neighbors 0.731707 0.666667 0.810811 0.685714 0.678133
3 SVM 0.729412 0.645833 0.837838 0.671429 0.661343
4 MLP 0.670968 0.641975 0.702703 0.635714 0.631654
5 Decision Tree 0.653595 0.632911 0.675676 0.621429 0.618141
6 Extra Trees 0.733728 0.652632 0.837838 0.678571 0.668919
7 Random Forest 0.726190 0.648936 0.824324 0.671429 0.662162
8 XGBoost 0.704403 0.658824 0.756757 0.664286 0.658681
# evolution [3]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.795181 0.717391 0.891892 0.757143 0.748976
1 Naive Bayes 0.805031 0.752941 0.864865 0.778571 0.773342
2 K-Nearest Neighbors 0.735484 0.703704 0.770270 0.707143 0.703317
3 SVM 0.797619 0.712766 0.905405 0.757143 0.748157
4 MLP 0.787879 0.714286 0.878378 0.750000 0.742219
5 Decision Tree 0.631579 0.615385 0.648649 0.600000 0.597052
6 Extra Trees 0.688312 0.662500 0.716216 0.657143 0.653563
7 Random Forest 0.704403 0.658824 0.756757 0.664286 0.658681
8 XGBoost 0.713376 0.674699 0.756757 0.678571 0.673833
# str features [4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.729412 0.645833 0.837838 0.671429 0.661343
1 Naive Bayes 0.723926 0.662921 0.797297 0.678571 0.671376
2 K-Nearest Neighbors 0.662338 0.637500 0.689189 0.628571 0.624898
3 SVM 0.727273 0.627451 0.864865 0.657143 0.644554
4 MLP 0.710843 0.641304 0.797297 0.657143 0.648649
5 Decision Tree 0.561151 0.600000 0.527027 0.564286 0.566544
6 Extra Trees 0.567376 0.597015 0.540541 0.564286 0.565725
7 Random Forest 0.596026 0.584416 0.608108 0.564286 0.561630
8 XGBoost 0.630872 0.626667 0.635135 0.607143 0.605446
#=========================================================================
# stability + evolution + str features [13 = 6+3+4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.726115 0.686747 0.770270 0.692857 0.688165
1 Naive Bayes 0.730769 0.695122 0.770270 0.700000 0.695741
2 K-Nearest Neighbors 0.742515 0.666667 0.837838 0.692857 0.684070
3 SVM 0.763636 0.692308 0.851351 0.721429 0.713554
4 MLP 0.717949 0.682927 0.756757 0.685714 0.681409
5 Decision Tree 0.671429 0.712121 0.635135 0.671429 0.673628
6 Extra Trees 0.756410 0.719512 0.797297 0.728571 0.724406
7 Random Forest 0.742138 0.694118 0.797297 0.707143 0.701679
8 XGBoost 0.692810 0.670886 0.716216 0.664286 0.661138
# stability + evolution [9=6+3]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.729560 0.682353 0.783784 0.692857 0.687346
1 Naive Bayes 0.743590 0.707317 0.783784 0.714286 0.710074
2 K-Nearest Neighbors 0.720497 0.666667 0.783784 0.678571 0.672195
3 SVM 0.771084 0.695652 0.864865 0.728571 0.720311
4 MLP 0.679739 0.658228 0.702703 0.650000 0.646806
5 Decision Tree 0.620690 0.633803 0.608108 0.607143 0.607084
6 Extra Trees 0.727273 0.700000 0.756757 0.700000 0.696560
7 Random Forest 0.734177 0.690476 0.783784 0.700000 0.694922
8 XGBoost 0.675497 0.662338 0.689189 0.650000 0.647625
# stability + str features [10=6+4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.750000 0.697674 0.810811 0.714286 0.708436
1 Naive Bayes 0.714286 0.687500 0.743243 0.685714 0.682228
2 K-Nearest Neighbors 0.687500 0.639535 0.743243 0.642857 0.636773
3 SVM 0.743902 0.677778 0.824324 0.700000 0.692465
4 MLP 0.716981 0.670588 0.770270 0.678571 0.673014
5 Decision Tree 0.616438 0.625000 0.608108 0.600000 0.599509
6 Extra Trees 0.697368 0.679487 0.716216 0.671429 0.668714
7 Random Forest 0.684211 0.666667 0.702703 0.657143 0.654382
8 XGBoost 0.666667 0.645570 0.689189 0.635714 0.632473
# evolution + str features[7=3+4]
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.773006 0.707865 0.851351 0.735714 0.728706
1 Naive Bayes 0.750000 0.730769 0.770270 0.728571 0.726044
2 K-Nearest Neighbors 0.737500 0.686047 0.797297 0.700000 0.694103
3 SVM 0.763636 0.692308 0.851351 0.721429 0.713554
4 MLP 0.775758 0.703297 0.864865 0.735714 0.727887
5 Decision Tree 0.675497 0.662338 0.689189 0.650000 0.647625
6 Extra Trees 0.715232 0.701299 0.729730 0.692857 0.690622
7 Random Forest 0.715232 0.701299 0.729730 0.692857 0.690622
8 XGBoost 0.721519 0.678571 0.770270 0.685714 0.680590


@ -0,0 +1,156 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 3 17:08:18 2022
@author: tanu
"""
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
import pandas as pd
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
# this needs to be merged_df2 or merged_df3?
#gene 'pncA'
drug = 'pyrazinamide'
my_df = pd.read_csv("pnca_merged_df3.csv")
my_df.dtypes
my_df_cols = my_df.columns
#%%============================================================================
# GET Y
# Y = my_df.loc[:,drug] #has NA
dm_om_map = {'DM': 1, 'OM': 0}
my_df['resistance'] = my_df['mutation_info_labels'].map(dm_om_map)
# sanity check
my_df['resistance'].value_counts()
my_df['mutation_info_labels'].value_counts()
Y = my_df['resistance']
# GET X
cols = my_df.columns
X_stability = my_df[['ligand_distance'
, 'ligand_affinity_change'
, 'duet_stability_change'
, 'ddg_foldx'
, 'deepddg'
, 'ddg_dynamut2']]
X_evol = my_df[['consurf_score'
, 'snap2_score'
, 'snap2_accuracy_pc']]
X_str = my_df[['asa'
, 'rsa'
, 'kd_values'
, 'rd_values']]
#%% try combinations
X_vars = X_stability
X_vars = X_evol
X_vars = X_str
X_vars = pd.concat([X_stability, X_evol, X_str], axis = 1)
X_vars = pd.concat([X_stability, X_evol], axis = 1)
X_vars = pd.concat([X_stability, X_str], axis = 1)
X_vars = pd.concat([X_evol, X_str], axis = 1)
#%%
X_vars.shape[1]
# TODO: stratified cross validate
# Train-test Split
rs = {'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X_vars,
Y,
test_size = 0.33,
random_state = 42)
# Classification - Model Pipeline
def modelPipeline(X_train, X_test, y_train, y_test):
    log_reg = LogisticRegression(**rs)
    nb = BernoulliNB()
    knn = KNeighborsClassifier()
    svm = SVC(**rs)
    mlp = MLPClassifier(max_iter=500, **rs)
    dt = DecisionTreeClassifier(**rs)
    et = ExtraTreesClassifier(**rs)
    rf = RandomForestClassifier(**rs)
    xgb = XGBClassifier(**rs, verbosity=0)

    clfs = [
        ('Logistic Regression', log_reg),
        ('Naive Bayes', nb),
        ('K-Nearest Neighbors', knn),
        ('SVM', svm),
        ('MLP', mlp),
        ('Decision Tree', dt),
        ('Extra Trees', et),
        ('Random Forest', rf),
        ('XGBoost', xgb)
    ]

    pipelines = []
    scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])

    for clf_name, clf in clfs:
        pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('classifier', clf)
        ])
        pipeline.fit(X_train, y_train)
        # Model predictions
        y_pred = pipeline.predict(X_test)
        # F1-Score
        fscore = f1_score(y_test, y_pred)
        # Precision
        pres = precision_score(y_test, y_pred)
        # Recall
        rcall = recall_score(y_test, y_pred)
        # Accuracy
        accu = accuracy_score(y_test, y_pred)
        # ROC_AUC
        roc_auc = roc_auc_score(y_test, y_pred)
        pipelines.append(pipeline)
        scores_df = scores_df.append({
            'Model' : clf_name,
            'F1_Score' : fscore,
            'Precision' : pres,
            'Recall' : rcall,
            'Accuracy' : accu,
            'ROC_AUC' : roc_auc
        },
        ignore_index = True)
    return pipelines, scores_df
modelPipeline(X_train, X_test, y_train, y_test)


@ -0,0 +1,81 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 3 17:08:18 2022
@author: tanu
"""
#%% load packages
import sys, os
import pandas as pd
from pandas import DataFrame
import numpy as np
import argparse
from functools import reduce
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
#gene = '' # set in the session before running, e.g. gene = 'pncA'
#drug = '' # e.g. drug = 'pyrazinamide'
#==============
# directories
#==============
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
# gene_baiscL = ['pnca']
# geneL_naL = ['gid', 'rpob']
# geneL_ppi2L = ['alr', 'embb', 'katg', 'rpob']
#=======
# input
#=======
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
my_df = pd.read_csv(infile_ml1)
my_df.dtypes
my_df_cols = my_df.columns
#%%============================================================================
# GET Y
drug_labels = drug + '_labels'
drug_labels
my_df[drug_labels] = my_df[drug]
my_df[drug_labels].value_counts()
my_df[drug_labels] = my_df[drug].map({1: 'resistant', 0: 'sensitive'})
my_df[drug_labels].value_counts()
my_df[drug_labels] = my_df[drug_labels].fillna('unknown')
my_df[drug_labels].value_counts()
mutC = my_df[[ 'mutationinformation']].count()
target1C = my_df['mutation_info_labels'].value_counts()
target2C = my_df[drug_labels].value_counts()
#target2C.index = target2C.index.to_series().map({1: 'resistant', 0: 'sensitive'})
target3C = my_df['drtype'].value_counts()
targetsC = pd.concat([mutC, target1C, target2C, target3C])
targetsC
# targetsC2 = pd.concat([mutC, target1C, target2C
# #, target3C
# ], axis = 1)
# targetsC2
#%% try combinations
# X_vars = X_stability
# X_vars = X_evol
# X_vars = X_str
# X_vars = pd.concat([X_stability, X_evol, X_str], axis = 1)
# X_vars = pd.concat([X_stability, X_evol], axis = 1)
# X_vars = pd.concat([X_stability, X_str], axis = 1)
# X_vars = pd.concat([X_evol, X_str], axis = 1)


@ -0,0 +1,212 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 24 10:48:10 2022
@author: tanu
"""
###############################################################################
# questions:
# which data to use: merged_df3 or merged_df2
# which is the target? or_mychisq or drtype col
# scaling: can it be from -1 to 1?
# strategy:
# available data = X_train
# available data but NAN = validation_test
# test data: mut generated not in mcsm
###############################################################################
import os, sys
import re
from sklearn.datasets import load_boston
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
#%% read data
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
# this needs to be merged_df2 or merged_df3?
my_df = pd.read_csv("pnca_all_params.csv")
my_df.dtypes
my_df_cols = my_df.columns
omit_cols1 = ['pdb_file'
, 'seq_offset4pdb'
, 'mut_3upper'
, 'wild_pos'
, 'wild_chain_pos'
, 'chain'
, 'wt_3upper'
, 'consurf_colour'
, 'consurf_colour_rev'
, 'consurf_msa_data'
, 'consurf_aa_variety'
, 'snap2_accuracy_pc'
, 'beta_logistic'
, 'se_logistic'
, 'zval_logisitc'
, 'pval_chisq'
, 'log10_or_mychisq'
, 'neglog_pval_fisher'
, 'wild_type'
, 'mutant_type'
, 'position'
, 'ligand_id'
, 'mutation'
, 'ss'
, 'ss_class' # include it later?
]
omit_cols2 = list(my_df.columns[my_df.columns.str.contains(".*ci_.*") | my_df.columns.str.contains(".*_scaled*") | my_df.columns.str.contains(".*_outcome*")])
# [WATCH:] just to test since these have negative values!
omit_cols3 = list(my_df.columns[my_df.columns.str.contains("electro_.*") | my_df.columns.str.contains("disulfide_.*") | my_df.columns.str.contains("hbonds_.*") | my_df.columns.str.contains("partcov_.*") | my_df.columns.str.contains("vdwclashes.*") | my_df.columns.str.contains("volumetric.*")])
omit_cols = omit_cols1 + omit_cols2 + omit_cols3
my_df_filt = my_df.loc[:, ~my_df.columns.isin(omit_cols)]
my_df_filt_cols = my_df_filt.columns
foo = my_df_filt['or_mychisq'].value_counts()
foo = foo.to_frame()
########################
# [WATCH]: Drop na
my_df2 = my_df_filt.dropna()
my_df2['resistance'] = my_df2['or_mychisq'].apply(lambda x: 0 if x <=1 else 1)
my_df2['resistance'].value_counts()
y = my_df2['resistance']
#==============================================================================
omit_cols_y = ['or_mychisq', 'resistance']
my_df_ml = my_df2.loc[:, ~my_df2.columns.isin(omit_cols_y)]
#%%############################################################################
X_train = my_df_ml.set_index('mutationinformation')
X_train = X_train.iloc[:,:4]
y_train = y
#X_train = X_train.dropna()
#y_train = y.dropna()
# check dim
X_train.shape
y_train.shape
#%%=====================================================
grid = sns.PairGrid(data = pd.concat([X_train
, pd.Series(y_train , name = "resistance")]
, axis = 1))
grid.map_offdiag(sns.scatterplot)
grid.map_diag(sns.distplot)
plt.show()
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
###################
# test set
X_test = my_df[my_df['or_mychisq'].isnull()]
#X_test =[ X_test.iloc[:,:4]]
# HARD part?
# what should be the test set?
X_test = [23.9, 0.69, -0.16, 0.59
, 5, 0.5, 0.4, -1
, 0.1, 1, 1, 1]
X_test_re = np.array(X_test).reshape(3, -1)
####################
fitted = model.predict(X_train)
model.coef_
model.predict(X_test_re)
resid = y_train - fitted
resid
#####################
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
#We can then create a scaled training set
X_train_scaled = scaler.transform(X_train)
new_scaled = scaler.transform(X_test_re)
model.predict(new_scaled)
#########
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
# model_pipe = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
# ,('regression', linear_model.LinearRegression())
# ])
model_pipe = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
,('logis', LogisticRegression(class_weight = 'balanced'))
])
model_pipe.fit(X_train,y_train)
fitted_vals = model_pipe.predict(X_train)
# gives the array of predictions
model_pipe.predict(X_test_re)
# for Linear Reg only
# resid = y_train - fitted_vals
# resid
#=====
# Logistic 1 test
# FAILS because the test set must have the same dimensionality as the input:
# i.e. if you give the model 10 features to train on, you need to supply
# 10 features per sample to predict anything.
# THINK!!!!
#=====
mod_logis = linear_model.LogisticRegression(class_weight = 'balanced')
mod_logis.fit(X_train,y_train)
X_test = [23.9]
X_test_re = np.array(X_test).reshape(1, -1)
mod_logis.predict(X_test_re)
#################
from sklearn.metrics import accuracy_score, precision_score, recall_score
y_pred = model_pipe.predict(X_train)
accuracy_score(y_train,y_pred)
precision_score(y_train,y_pred,pos_label=1)# tp/(tp + fp)
recall_score(y_train,y_pred,pos_label=1) # tp/(tp + fn)
########
# WORKS!
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
import pandas as pd
acc = make_scorer(accuracy_score)
def precision(y_true, y_pred):
    return precision_score(y_true, y_pred, pos_label = 1) #0
def recall(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label = 1) #0
prec = make_scorer(precision)
rec = make_scorer(recall)
output = cross_validate(model_pipe
, X_train
, y_train
, scoring = {'acc' : acc
,'prec' : prec
,'rec' : rec}
, cv = 10, return_train_score = False)
pd.DataFrame(output).mean()
# fit_time 0.005486
# score_time 0.002673
# test_acc 0.601799
# test_prec 0.976936
# test_rec 0.603226
# dtype: float64
# the three scores
# 0.65527950310559
# 0.9853658536585366
# 0.6516129032258065
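# Aside (hedged sketch): the precision/recall wrapper functions above are not
# strictly needed; make_scorer forwards keyword args straight to the metric:
prec2 = make_scorer(precision_score, pos_label = 1)
rec2 = make_scorer(recall_score, pos_label = 1)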


@ -0,0 +1,272 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 5 12:57:32 2022
@author: tanu
"""
#%%
# Data, etc for now comes from my_data6.py and/or my_data5.py
#%% Specify dir and import functions
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/")
#%% Try combinations
#import sys, os
#os.system("imports.py")
def precision(y_true, y_pred):
    return precision_score(y_true, y_pred, pos_label = 1)
def recall(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label = 1)
def f1(y_true, y_pred):
    return f1_score(y_true, y_pred, pos_label = 1)
#%% Check df features
numerical_features_df.shape
categorical_features_df.shape
all_features_df.shape
all_features_df.dtypes
#%% Simple train and test data splits
target = target1
#target = target3
X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df,
target,
test_size = 0.33,
random_state = 42)
X_trainC, X_testC, y_trainC, y_testC = train_test_split(categorical_features_df,
target,
test_size = 0.33,
random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(all_features_df,
target,
test_size = 0.33,
random_state = 42)
#%% Stratified K-fold: Single model
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('log_reg', LogisticRegression(class_weight = 'balanced')) ])
model1
rs = {'random_state': 42}
log_reg = LogisticRegression(**rs)
nb = BernoulliNB()
clfs = [('Logistic Regression', log_reg)
,('Naive Bayes', nb)]
seed_skf = 42
skf = StratifiedKFold(n_splits = 10
, shuffle = True
, random_state = seed_skf)
X_array = np.array(numerical_features_df)
Y = target1
model_scores_df = pd.DataFrame()
fscoreL = []
mccL = []
presL = []
recallL = []
accuL = []
roc_aucL = []
for train_index, test_index in skf.split(X_array, Y):
    x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
    y_train_fold, y_test_fold = Y[train_index], Y[test_index]
    model1.fit(x_train_fold, y_train_fold)
    y_pred_fold = model1.predict(x_test_fold)
    #----------------
    # Model metrics
    #----------------
    # F1-Score
    fscore = f1_score(y_test_fold, y_pred_fold)
    fscoreL.append(fscore)
    fscoreM = mean(fscoreL) # running mean across the folds so far
    # Matthews correlation coefficient
    mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
    mccL.append(mcc)
    mccM = mean(mccL)
    # Precision
    pres = precision_score(y_test_fold, y_pred_fold)
    presL.append(pres)
    presM = mean(presL)
    # Recall
    recall = recall_score(y_test_fold, y_pred_fold)
    recallL.append(recall)
    recallM = mean(recallL)
    # Accuracy
    accu = accuracy_score(y_test_fold, y_pred_fold)
    accuL.append(accu)
    accuM = mean(accuL)
    # ROC_AUC
    roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
    roc_aucL.append(roc_auc)
    roc_aucM = mean(roc_aucL)
    model_scores_df = model_scores_df.append({'Model'    : model1.steps[1][0]
                                              , 'F1_score' : fscoreM
                                              , 'MCC'      : mccM
                                              , 'Precision': presM
                                              , 'Recall'   : recallM
                                              , 'Accuracy' : accuM
                                              , 'ROC_curve': roc_aucM}
                                              , ignore_index = True)
print('\nModel metrics:', model_scores_df)
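#%% Same metrics via cross_validate (hedged sketch)
# The manual SKF loop above can be reproduced with cross_validate and a dict of
# scorers; MCC has no string alias here, so it goes through make_scorer.
# Assumes cross_validate/make_scorer are imported as in the companion scripts.
mcc_scorer = make_scorer(matthews_corrcoef)
cv_out = cross_validate(model1
                        , X_array
                        , Y
                        , cv = skf
                        , scoring = {'f1'        : 'f1'
                                     , 'mcc'      : mcc_scorer
                                     , 'precision': 'precision'
                                     , 'recall'   : 'recall'
                                     , 'accuracy' : 'accuracy'
                                     , 'roc_auc'  : 'roc_auc'}
                        , return_train_score = False)
print(pd.DataFrame(cv_out).mean())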
#%% stratified KFold: Multiple_models:
input_df = numerical_features_df
#X_array = np.array(input_df)
Y = target1
var_type = 'numerical'
input_df = all_features_df
#X_array = np.array(input_df)
Y = target1
var_type = 'mixed'
input_df = categorical_features_df
#X_array = np.array(input_df)
Y = target1
var_type = 'categorical'
#=================
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix
# Determine preprocessing steps ~ var_type
if var_type == 'numerical':
    t = [('num', MinMaxScaler(), numerical_ix)]
if var_type == 'categorical':
    t = [('cat', OneHotEncoder(), categorical_ix)]
if var_type == 'mixed':
    t = [('cat', OneHotEncoder(), categorical_ix)
         , ('num', MinMaxScaler(), numerical_ix)]
##############################
col_transform = ColumnTransformer(transformers = t
, remainder='passthrough')
rs = {'random_state': 42}
#log_reg = LogisticRegression(**rs)
log_reg = LogisticRegression(class_weight = 'balanced')
nb = BernoulliNB()
rf = RandomForestClassifier(**rs)
clfs = [('Logistic Regression', log_reg)
#,('Naive Bayes', nb)
#, ('Random Forest' , rf)
]
#seed_skf = 42
skf = StratifiedKFold(n_splits = 10
, shuffle = True
#, random_state = seed_skf
, **rs)
#scores_df = pd.DataFrame()
fscoreL = []
mccL = []
presL = []
recallL = []
accuL = []
roc_aucL = []
for train_index, test_index in skf.split(input_df, Y):
    print('\nSKF train index:', train_index
          , '\nSKF test index:', test_index)
    x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
    y_train_fold, y_test_fold = Y.iloc[train_index], Y.iloc[test_index]
    # for train_index, test_index in skf.split(X_array, Y):
    #     print('\nSKF train index:', train_index
    #           , '\nSKF test index:', test_index)
    #     x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
    #     y_train_fold, y_test_fold = Y[train_index], Y[test_index]
    clf_scores_df = pd.DataFrame()
    for clf_name, clf in clfs:
        # model2 = Pipeline(steps=[('preprocess', MinMaxScaler())
        #                          , ('classifier', clf)])
        model2 = Pipeline(steps=[('preprocess', col_transform)
                                 , ('classifier', clf)])
        model2.fit(x_train_fold, y_train_fold)
        y_pred_fold = model2.predict(x_test_fold)
        #----------------
        # Model metrics
        #----------------
        # F1-Score
        fscore = f1_score(y_test_fold, y_pred_fold)
        fscoreL.append(fscore)
        fscoreM = mean(fscoreL)
        # Matthews correlation coefficient
        mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
        mccL.append(mcc)
        mccM = mean(mccL)
        # Precision
        pres = precision_score(y_test_fold, y_pred_fold)
        presL.append(pres)
        presM = mean(presL)
        # Recall
        recall = recall_score(y_test_fold, y_pred_fold)
        recallL.append(recall)
        recallM = mean(recallL)
        # Accuracy
        accu = accuracy_score(y_test_fold, y_pred_fold)
        accuL.append(accu)
        accuM = mean(accuL)
        # ROC_AUC
        roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
        roc_aucL.append(roc_auc)
        roc_aucM = mean(roc_aucL)
        clf_scores_df = clf_scores_df.append({'Model'     : clf_name
                                              , 'F1_score' : fscoreM
                                              , 'MCC'      : mccM
                                              , 'Precision': presM
                                              , 'Recall'   : recallM
                                              , 'Accuracy' : accuM
                                              , 'ROC_curve': roc_aucM}
                                              , ignore_index = True)
#scores_df = scores_df.append(clf_scores_df)
#%% Call functions
tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN)
tN_res
t2_res = MultClassPipeline2(X_train, X_test, y_train, y_test, input_df = all_features_df)
t2_res
#CHECK: numbers are awfully close to each other!
t3_res = MultClassPipeSKF(input_df = numerical_features_df
, y_targetF = target1
, var_type = 'numerical'
, skf_splits = 10)
t3_res
#CHECK: numbers are awfully close to each other!
t4_res = MultClassPipeSKF(input_df = all_features_df
, y_targetF = target1
, var_type = 'mixed'
, skf_splits = 10)
t4_res


@ -0,0 +1,195 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 5 12:57:32 2022
@author: tanu
"""
#%%
# Data, etc for now comes from my_data6.py and/or my_data5.py
#%% Specify dir and import functions
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/")
#%% Try combinations
#import sys, os
#os.system("imports.py")
def precision(y_true, y_pred):
    return precision_score(y_true, y_pred, pos_label = 1)
def recall(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label = 1)
def f1(y_true, y_pred):
    return f1_score(y_true, y_pred, pos_label = 1)
#%% Check df features
numerical_features_df.shape
categorical_features_df.shape
all_features_df.shape
all_features_df.dtypes
#%% Simple train and test data splits
target = target1
#target = target3
X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df,
target,
test_size = 0.33,
random_state = 42)
X_trainC, X_testC, y_trainC, y_testC = train_test_split(categorical_features_df,
target,
test_size = 0.33,
random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(all_features_df,
target,
test_size = 0.33,
random_state = 42)
#%% Stratified K-fold: Single model
input_df = numerical_features_df
#X_array = np.array(input_df)
var_type = 'numerical'
input_df = all_features_df
#X_array = np.array(input_df)
var_type = 'mixed'
input_df = categorical_features_df
#X_array = np.array(input_df)
var_type = 'categorical'
y_targetF = target1
#==============================================================================
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix
# Determine preprocessing steps ~ var_type
if var_type == 'numerical':
    t = [('num', MinMaxScaler(), numerical_ix)]
if var_type == 'categorical':
    t = [('cat', OneHotEncoder(), categorical_ix)]
if var_type == 'mixed':
    t = [('cat', OneHotEncoder(), categorical_ix)
         , ('num', MinMaxScaler(), numerical_ix)]
###############################################################################
col_transform = ColumnTransformer(transformers = t
, remainder='passthrough')
###############################################################################
rs = {'random_state': 42}
#del(model1) # NameError on a fresh session: model1 is not defined yet at this point
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
                           , ('log_reg', LogisticRegression(class_weight = None)) ]) # 'unbalanced' is not a valid option; None (the default) applies no reweighting
# model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
# , ('log_reg', LogisticRegression(**rs)) ])
del(model1)
nb = BernoulliNB()
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('nb', nb) ])
del(model1)
knn = KNeighborsClassifier()
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('knn', knn) ])
del(model1)
svm = SVC(**rs)
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('svm', svm) ])
del(model1)
mlp = MLPClassifier(max_iter = 500, **rs)
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('mlp', mlp) ])
del(model1)
dt = DecisionTreeClassifier(**rs)
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('dt', dt) ])
del(model1)
et = ExtraTreesClassifier(**rs)
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('et', et) ])
del(model1)
rf = RandomForestClassifier(**rs)
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('rf', rf) ])
###############################################################################
#%% run
#del(mm) # NameError on a fresh session: mm is not defined yet at this point
skf = StratifiedKFold(n_splits = 10
, shuffle = True
, **rs)
#X_array = np.array(numerical_features_df)
#Y = target1
model_scores_df = pd.DataFrame()
fscoreL = []
mccL = []
presL = []
recallL = []
accuL = []
roc_aucL = []
# for train_index, test_index in skf.split(X_array, Y):
# x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
# y_train_fold, y_test_fold = Y[train_index], Y[test_index]
for train_index, test_index in skf.split(input_df, y_targetF):
    x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
    y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
    model1.fit(x_train_fold, y_train_fold)
    y_pred_fold = model1.predict(x_test_fold)
    #----------------
    # Model metrics
    #----------------
    # F1-Score
    fscore = f1_score(y_test_fold, y_pred_fold)
    fscoreL.append(fscore)
    fscoreM = mean(fscoreL) # running mean across the folds so far
    # Matthews correlation coefficient
    mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
    mccL.append(mcc)
    mccM = mean(mccL)
    # Precision
    pres = precision_score(y_test_fold, y_pred_fold)
    presL.append(pres)
    presM = mean(presL)
    # Recall
    recall = recall_score(y_test_fold, y_pred_fold)
    recallL.append(recall)
    recallM = mean(recallL)
    # Accuracy
    accu = accuracy_score(y_test_fold, y_pred_fold)
    accuL.append(accu)
    accuM = mean(accuL)
    # ROC_AUC
    roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
    roc_aucL.append(roc_auc)
    roc_aucM = mean(roc_aucL)
    model_scores_df = model_scores_df.append({'Model'    : model1.steps[1][0]
                                              , 'F1_score' : fscoreM
                                              , 'MCC'      : mccM
                                              , 'Precision': presM
                                              , 'Recall'   : recallM
                                              , 'Accuracy' : accuM
                                              , 'ROC_curve': roc_aucM}
                                              , ignore_index = True)
print('\nModel metrics:\n', model_scores_df)
mm = model_scores_df.mean()
print('\nModel metrics mean:\n', mm)
print('\nModel metrics:\n', model_scores_df)


@ -0,0 +1,179 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 24 10:48:10 2022
@author: tanu
"""
###############################################################################
# questions:
# which data to use: merged_df3 or merged_df2
# which is the target? or_mychisq or drtype col
# scaling: can it be from -1 to 1?
# how to include the mutation information?
# 'wild_type', 'mutant', 'postion'
# whether to log transform the af and or cols
# to allow mean mode values to be imputed for validation set
# whether to calculate mean, median accounting for NA or removing them?
# strategy:
# available data = X_train
# available data but NAN = validation_test
# test data: mut generated not in mcsm
###############################################################################
import os, sys
import re
from sklearn.datasets import load_boston
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
#%% read data
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
# this needs to be merged_df2 or merged_df3?
my_df = pd.read_csv("pnca_all_params.csv")
my_df.dtypes
my_df_cols = my_df.columns
omit_cols1 = ['pdb_file'
, 'seq_offset4pdb'
, 'mut_3upper'
, 'wild_pos'
, 'wild_chain_pos'
, 'chain'
, 'wt_3upper'
, 'consurf_colour'
, 'consurf_colour_rev'
, 'consurf_msa_data'
, 'consurf_aa_variety'
, 'snap2_accuracy_pc'
, 'beta_logistic'
, 'se_logistic'
, 'zval_logisitc'
, 'pval_chisq'
, 'log10_or_mychisq'
, 'neglog_pval_fisher'
, 'or_fisher'
, 'wild_type'
, 'mutant_type'
, 'position'
, 'ligand_id'
, 'mutation'
, 'ss'
, 'ss_class' # include it later?
, 'contacts'
]
omit_cols2 = list(my_df.columns[my_df.columns.str.contains(".*ci_.*") | my_df.columns.str.contains(".*_scaled*") | my_df.columns.str.contains(".*_outcome*")])
# [WATCH:] just to test since these have negative values!
omit_cols3 = list(my_df.columns[my_df.columns.str.contains("electro_.*") | my_df.columns.str.contains("disulfide_.*") | my_df.columns.str.contains("hbonds_.*") | my_df.columns.str.contains("partcov_.*") | my_df.columns.str.contains("vdwclashes.*") | my_df.columns.str.contains("volumetric.*")])
omit_cols = omit_cols1 + omit_cols2 + omit_cols3
my_df_filt = my_df.loc[:, ~my_df.columns.isin(omit_cols)]
my_df_filt_cols = my_df_filt.columns
#fill NaNs with column means in each column
my_df_filt2 = my_df_filt.fillna(my_df_filt.mean())
my_df_filt3 = my_df_filt.fillna(my_df_filt.median())
my_df_filt_noNA = my_df_filt.fillna(0)
summ = my_df_filt.describe()
summ_noNA = my_df_filt_noNA.describe()
foo = my_df_filt['or_mychisq'].value_counts()
foo = foo.to_frame()
########################
# [WATCH]: Drop na
my_df2 = my_df_filt3.dropna()
my_df2['resistance'] = my_df2['or_mychisq'].apply(lambda x: 0 if x <=1 else 1)
my_df2['resistance'].value_counts()
y = my_df2['resistance']
y.value_counts()
#%%============================================================================
X_validation_muts = my_df['mutationinformation'][~my_df['mutationinformation'].isin(my_df2['mutationinformation'])]
X_validation_all = my_df_filt3[~my_df_filt3['mutationinformation'].isin(my_df2['mutationinformation'])]
X_validation_f = X_validation_all.loc[:, ~X_validation_all.columns.isin(['or_mychisq', 'resistance'])]
X_validation = X_validation_f.set_index('mutationinformation')
#%% fill na in cols with mean value
X_validation.info()
X_validation.isna().any()
na_df = X_validation_f[X_validation_f.columns[X_validation_f.isna().any()]]
na_colnames = X_validation_f.columns[X_validation_f.isna().any()]
na_colsL = list(na_colnames)
#==============================================================================
omit_cols_y = ['or_mychisq', 'resistance']
my_df_ml = my_df2.loc[:, ~my_df2.columns.isin(omit_cols_y)]
#%%############################################################################
X_train = my_df_ml.set_index('mutationinformation')
#X_train = X_train.iloc[:,:4]
y_train = y
#X_train = X_train.dropna()
#y_train = y.dropna()
# check dim
X_train.shape
y_train.shape
###############################################################################
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score
model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
, ('logis', LogisticRegression(class_weight = 'balanced'))
])
model_logisP.fit(X_train, y_train)
fitted_vals = model_logisP.predict(X_train)
fitted_vals
# gives the array of predictions
model_logisP.predict(X_train)
model_logisP.predict(X_validation)
y_pred = model_logisP.predict(X_train)
y_pred2 = model_logisP.predict(X_validation)
accuracy_score(y_train, y_pred)
precision_score(y_train, y_pred, pos_label = 1)# tp/(tp + fp)
recall_score(y_train, y_pred, pos_label = 1) # tp/(tp + fn)
# (no ground-truth labels exist for X_validation, so y_pred2 cannot be scored here)
################
acc = make_scorer(accuracy_score)
def precision(y_true,y_pred):
return precision_score(y_true,y_pred,pos_label = 1) #0
def recall(y_true,y_pred):
return recall_score(y_true, y_pred, pos_label = 1) #0
prec = make_scorer(precision)
rec = make_scorer(recall)
output = cross_validate(model_logisP
, X_train
, y
, scoring = {'acc' : acc
,'prec' : prec
,'rec' : rec}
, cv = 10, return_train_score = False)
pd.DataFrame(output).mean()
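# A minimal alternative (sketch): make_scorer() forwards extra keyword
# arguments to the underlying metric, so the wrapper functions above are
# not strictly needed.
prec_alt = make_scorer(precision_score, pos_label = 1)
rec_alt = make_scorer(recall_score, pos_label = 1)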


@ -0,0 +1,376 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 24 10:48:10 2022
@author: tanu
"""
###############################################################################
# questions:
# which data to use: merged_df3 or merged_df2
# which is the target? or_mychisq or drtype col
# scaling: can it be from -1 to 1?
# how to include the mutation information?
# 'wild_type', 'mutant', 'postion'
# whether to log transform the af and or cols
# to allow mean mode values to be imputed for validation set
# whether to calculate mean, median accounting for NA or removing them?
# strategy:
# available data = X_train
# available data but NAN = validation_test
# test data: mut generated not in mcsm
###############################################################################
import os, sys
import re
from sklearn.datasets import load_boston
from sklearn import datasets
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
from statistics import mean, stdev
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
#%% read data
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
# this needs to be merged_df2 or merged_df3?
my_df = pd.read_csv("pnca_all_params.csv")
my_df.dtypes
my_df_cols = my_df.columns
omit_cols1 = ['pdb_file'
, 'seq_offset4pdb'
, 'mut_3upper'
, 'wild_pos'
, 'wild_chain_pos'
, 'chain'
, 'wt_3upper'
, 'consurf_colour'
, 'consurf_colour_rev'
, 'consurf_msa_data'
, 'consurf_aa_variety'
, 'snap2_accuracy_pc'
, 'beta_logistic'
, 'se_logistic'
, 'zval_logisitc'
, 'pval_chisq'
, 'log10_or_mychisq'
, 'neglog_pval_fisher'
, 'or_fisher'
, 'wild_type'
, 'mutant_type'
, 'position'
, 'ligand_id'
, 'mutation'
, 'ss'
, 'ss_class' # include it later?
, 'contacts'
]
omit_cols2 = list(my_df.columns[my_df.columns.str.contains(".*ci_.*") | my_df.columns.str.contains(".*_scaled*") | my_df.columns.str.contains(".*_outcome*")])
# [WATCH:] just to test since these have negative values!
omit_cols3 = list(my_df.columns[my_df.columns.str.contains("electro_.*") | my_df.columns.str.contains("disulfide_.*") | my_df.columns.str.contains("hbonds_.*") | my_df.columns.str.contains("partcov_.*") | my_df.columns.str.contains("vdwclashes.*") | my_df.columns.str.contains("volumetric.*")])
omit_cols = omit_cols1 + omit_cols2 + omit_cols3
# Filter df: Filter columns to focus on my selected ones
my_df_filt = my_df.loc[:, ~my_df.columns.isin(omit_cols)]
my_df_filt_cols = my_df_filt.columns
#Fill na of filtered df: fill NaNs with column means/medians in each column
my_df_filt2 = my_df_filt.fillna(my_df_filt.mean())
my_df_filt3 = my_df_filt.fillna(my_df_filt.median())
#my_df_filt_noNA = my_df_filt.fillna(0)
summ = my_df_filt.describe()
summ2 = my_df_filt2.describe()
summ3 = my_df_filt3.describe()
#summ_noNA = my_df_filt_noNA.describe()
########################
# [WATCH]: Drop na
# Get Y
my_df2 = my_df_filt.dropna().copy() # copy() avoids SettingWithCopyWarning on the assignment below
my_df2['resistance'] = my_df2['or_mychisq'].apply(lambda x: 0 if x <=1 else 1)
my_df2['resistance'].value_counts()
Y = my_df2['resistance']
Y = np.array(Y)
#Y = Y.reset_index()
#Y = Y.drop(['index'], axis = 1)
#Y.value_counts()
#Y = np.array(Y)
# GET X
omit_cols_y = ['or_mychisq', 'resistance']
my_df_ml = my_df2.loc[:, ~my_df2.columns.isin(omit_cols_y)]
#my_df_ml = my_df_ml.set_index('mutationinformation')
X = my_df_ml
X = X.drop(['mutationinformation'], axis = 1)
X = np.array(X)
#X = X.reset_index()
# check dim
X.shape
Y.shape
my_df2 = my_df2.reset_index()
#####################
#https://stackoverflow.com/questions/49134338/kfolds-cross-validation-vs-train-test-split
rf = RandomForestClassifier(n_estimators=100, random_state=42)
#https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2
# k-FOLD
print('Class Ratio:',
sum(Y)/len(Y))
print('Class Ratio:',
sum(my_df2['resistance'])/len(my_df2['resistance'])
)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
target = my_df2.loc[:,'resistance']
fold_no = 1
for train_index, test_index in skf.split(my_df2, target):
train = my_df2.loc[train_index,:]
test = my_df2.loc[test_index,:]
print('Fold',str(fold_no),
'Class Ratio:',
sum(test['resistance'])/len(test['resistance']))
fold_no += 1
model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
, ('logis', LogisticRegression(class_weight = 'balanced'))
])
X_features = my_df_ml.drop(['mutationinformation'], axis = 1).columns.to_list() # X itself is already a numpy array
def train_model(train, test, fold_no):
X = X_features
y = 'resistance' # single label column, so train[y] below is a Series rather than a DataFrame
X_train = train[X]
y_train = train[y]
X_test = test[X]
y_test = test[y]
model_logisP.fit(X_train,y_train)
predictions = model_logisP.predict(X_test)
print('Fold',str(fold_no),
'Accuracy:',
accuracy_score(y_test,predictions))
fold_no = 1
for train_index, test_index in skf.split(my_df2, target):
train = my_df2.loc[train_index,:]
test = my_df2.loc[test_index,:]
train_model(train,test,fold_no)
fold_no += 1
#%%
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=20)
lst_accu_stratified = []
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = X_scaled[:,[1,2,3,15,16]]
#lr = linear_model.LogisticRegression(class_weight = 'unbalanced')
lr = linear_model.LogisticRegression()
for train_index1, test_index1 in skf.split(X, Y):
#print(train_index)
#print(test_index)
x_train_fold1, x_test_fold1 = X_scaled[train_index1], X_scaled[test_index1]
y_train_fold1, y_test_fold1 = Y[train_index1], Y[test_index1]
lr.fit(x_train_fold1, y_train_fold1)
lst_accu_stratified.append(lr.score(x_test_fold1, y_test_fold1))
# print output
print('List of possible accuracy', lst_accu_stratified)
print('Max accuracy:', max(lst_accu_stratified)*100, "%")
print('Min accuracy:', min(lst_accu_stratified)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratified)*100,"%")
print('St Dev:', stdev(lst_accu_stratified)*100,"%")
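# Sketch: scaler.fit_transform(X) above was fitted on the full data before
# the folds were split, so every test fold has already influenced the scaler
# (mild leakage). Fitting the scaler inside a Pipeline keeps it per-fold.
# (Assumes the same X, Y and skf as above; the column slicing is omitted.)
from sklearn.model_selection import cross_val_score
pipe_lr = Pipeline(steps = [('pre', preprocessing.MinMaxScaler())
, ('lr', linear_model.LogisticRegression())])
leak_free_acc = cross_val_score(pipe_lr, X, Y, cv = skf)
print('Mean accuracy (leak-free):', mean(leak_free_acc)*100, "%")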
# cancer data
cancer = datasets.load_breast_cancer()
x = cancer.data
y = cancer.target
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
lst_accu_stratifiedC = []
scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(x)
x_scaled = x_scaled[:,[1,2,3, 15, 16]]
for train_index, test_index in skf.split(x, y):
#print(train_index)
#print(test_index)
x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index]
y_train_fold, y_test_fold = y[train_index], y[test_index]
lr.fit(x_train_fold, y_train_fold)
lst_accu_stratifiedC.append(lr.score(x_test_fold, y_test_fold))
# print output
print('List of possible accuracy', lst_accu_stratifiedC)
print('Max accuracy:', max(lst_accu_stratifiedC)*100, "%")
print('Min accuracy:', min(lst_accu_stratifiedC)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratifiedC)*100,"%")
print('St Dev:', stdev(lst_accu_stratifiedC)*100,"%")
#%%
##
# https://towardsdatascience.com/my-random-forest-classifier-cheat-sheet-in-python-fedb84f8cf4f
y_all = my_df_filt['or_mychisq'].apply(lambda x: 0 if x <=1 else 1) # NOTE: NaN fails x <= 1, so rows with missing OR are labelled 1 here
X_all = my_df_filt.drop(['mutationinformation', 'or_mychisq'], axis = 1)
seed = 20 # so that the result is reproducible
X_all = X_all.iloc[:,:6]
X_train, X_test, y_train, y_test = train_test_split(X_all,y_all
, test_size=0.333
, random_state = seed)
# One option is to make NA a category of its own.
# In pandas, missing values are NaN, and the encoder would ignore them.
# Replacing NaN with the string 'na' creates an explicit 'na' category
# that will be taken into account when encoding later on.
#X_train = X_train.fillna('na')
#X_test = X_test.fillna('na')
X_train = X_train.fillna(X_train.median())
X_test = X_test.fillna(X_test.median())
X_train.dtypes
features_to_encode = list(X_train.select_dtypes(include = ['object']).columns)
col_trans = make_column_transformer(
(OneHotEncoder(),features_to_encode),
remainder = "passthrough"
)
rf_classifier = RandomForestClassifier(
min_samples_leaf=50,
n_estimators=150,
bootstrap=True,
oob_score=True,
n_jobs=-1,
random_state=seed,
max_features='sqrt') # 'auto' is deprecated for classifiers and equals 'sqrt'
pipe = make_pipeline(col_trans, rf_classifier)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
accuracy_score(y_test, y_pred)
print(f"The accuracy of the model is {round(accuracy_score(y_test,y_pred),3)*100} %")
recall_score(y_test, y_pred)
precision_score(y_test, y_pred)
f1_score(y_test, y_pred)
roc_auc_score (y_test, y_pred)
roc_curve(y_test, y_pred)
train_probs = pipe.predict_proba(X_train)[:,1]
probs = pipe.predict_proba(X_test)[:, 1]
train_predictions = pipe.predict(X_train)
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs)}')
print(f'Test ROC AUC Score: {roc_auc_score(y_test, probs)}')
def evaluate_model(y_pred, probs,train_predictions, train_probs):
baseline = {}
baseline['recall']=recall_score(y_test,
[1 for _ in range(len(y_test))])
baseline['precision'] = precision_score(y_test,
[1 for _ in range(len(y_test))])
baseline['roc'] = 0.5
results = {}
results['recall'] = recall_score(y_test, y_pred)
results['precision'] = precision_score(y_test, y_pred)
results['roc'] = roc_auc_score(y_test, probs)
train_results = {}
train_results['recall'] = recall_score(y_train,
train_predictions)
train_results['precision'] = precision_score(y_train, train_predictions)
train_results['roc'] = roc_auc_score(y_train, train_probs)
# for metric in ['recall', 'precision', 'roc']:
# print(f"Baseline: {round(baseline[metric], 2)}Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}")
# Calculate false positive rates and true positive rates
base_fpr, base_tpr, _ = roc_curve(y_test, [1 for _ in range(len(y_test))])
model_fpr, model_tpr, _ = roc_curve(y_test, probs)
plt.figure(figsize = (8, 6))
plt.rcParams['font.size'] = 16
# Plot both curves
plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
plt.plot(model_fpr, model_tpr, 'r', label = 'model')
plt.legend(); plt.xlabel('False Positive Rate');
plt.ylabel('True Positive Rate'); plt.title('ROC Curves');
plt.show()
# Recall Baseline: 1.0 Test: 0.92 Train: 0.93
# Precision Baseline: 0.48 Test: 0.9 Train: 0.91
# Roc Baseline: 0.5 Test: 0.97 Train: 0.97
evaluate_model(y_pred,probs,train_predictions,train_probs)
#%%
import itertools
def plot_confusion_matrix(cm, classes, normalize = False,
title='Confusion matrix',
cmap=plt.cm.Greens): # can change color
plt.figure(figsize = (10, 10))
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title, size = 24)
plt.colorbar(aspect=4)
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45, size = 14)
plt.yticks(tick_marks, classes, size = 14)
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
# Label the plot
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): plt.text(j, i, format(cm[i, j], fmt),
fontsize = 20,
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.grid(None)
plt.tight_layout()
plt.ylabel('True label', size = 18)
plt.xlabel('Predicted label', size = 18)
# Let's plot it out
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes = ['0 - Susceptible', '1 - Resistant'],
title = 'R/S Confusion Matrix')
print(rf_classifier.feature_importances_)
print(f" There are {len(rf_classifier.feature_importances_)} features in total")


@ -0,0 +1,361 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 24 10:48:10 2022
@author: tanu
"""
###############################################################################
# questions:
# which data to use: merged_df3 or merged_df2
# which is the target? or_mychisq or drtype col
# scaling: can it be from -1 to 1?
# how to include the mutation information?
# 'wild_type', 'mutant', 'postion'
# whether to log transform the af and or cols
# to allow mean mode values to be imputed for validation set
# whether to calculate mean, median accounting for NA or removing them?
# strategy:
# available data = X_train
# available data but NAN = validation_test
# test data: mut generated not in mcsm
###############################################################################
import os, sys
import re
from sklearn.datasets import load_boston
from sklearn import datasets
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
from statistics import mean, stdev
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
from sklearn.metrics import plot_precision_recall_curve # deprecated since scikit-learn 1.0; PrecisionRecallDisplay.from_estimator replaces it
import itertools
#%% read data
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
# this needs to be merged_df2 or merged_df3?
#gene 'pncA'
drug = 'pyrazinamide'
my_df = pd.read_csv("pnca_merged_df3.csv")
my_df.dtypes
my_df_cols = my_df.columns
#%%
# GET Y
# Y = my_df.loc[:,drug] #has NA
dm_om_map = {'DM': 1, 'OM': 0}
my_df['resistance'] = my_df['mutation_info_labels'].map(dm_om_map)
# sanity check
my_df['resistance'].value_counts()
my_df['mutation_info_labels'].value_counts()
Y = my_df['resistance']
#%%
# GET X
cols = my_df.columns
X = my_df[['ligand_distance'
, 'ligand_affinity_change'
, 'duet_stability_change', 'ddg_foldx', 'deepddg', 'ddg_dynamut2', 'consurf_score'
, 'snap2_score'
#, 'snap2_accuracy_pc'
, 'asa'
, 'rsa']]
#%%
####################################
# SIMPLEST case of train_test split
# Random forest
# one hot encoder
# MinMaxScaler
# https://towardsdatascience.com/my-random-forest-classifier-cheat-sheet-in-python-fedb84f8cf4f
####################################
seed = 50
X_train, X_test, y_train, y_test = train_test_split(X,Y
, test_size = 0.333
, random_state = seed)
features_to_encode = list(X_train.select_dtypes(include = ['object']).columns)
col_trans = make_column_transformer(
(OneHotEncoder(),features_to_encode),
remainder = "passthrough"
)
MinMaxS = preprocessing.MinMaxScaler()
standardS = preprocessing.StandardScaler()
rf_classifier = RandomForestClassifier(
min_samples_leaf=50,
n_estimators=150,
bootstrap=True,
oob_score=True,
n_jobs=-1,
random_state=seed,
max_features='sqrt') # 'auto' is deprecated for classifiers and equals 'sqrt'
pipe = make_pipeline(col_trans
#, MinMaxS
#, standardS
, rf_classifier)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
accuracy_score(y_test, y_pred)
print("\nModel evaluation:\n")
print(f"Accuracy: {round(accuracy_score(y_test,y_pred),3)*100} %")
print(f"Recall: {round(recall_score(y_test,y_pred),3)*100} %")
print(f"Precision: {round(precision_score(y_test,y_pred),3)*100} %")
print(f"F1-score: {round(f1_score(y_test,y_pred),3)*100} %")
recall_score(y_test, y_pred)
precision_score(y_test, y_pred)
f1_score(y_test, y_pred)
roc_auc_score (y_test, y_pred) # on hard 0/1 predictions; probability scores (as below) are preferable
roc_curve(y_test, y_pred) # same caveat: pass probability scores for a meaningful curve
disp = plot_precision_recall_curve(pipe, X_test, y_test)
train_probs = pipe.predict_proba(X_train)[:,1]
probs = pipe.predict_proba(X_test)[:, 1]
train_predictions = pipe.predict(X_train)
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs)}')
print(f'Test ROC AUC Score: {roc_auc_score(y_test, probs)}')
def evaluate_model(y_pred, probs,train_predictions, train_probs):
baseline = {}
baseline['recall']=recall_score(y_test,
[1 for _ in range(len(y_test))])
baseline['precision'] = precision_score(y_test,
[1 for _ in range(len(y_test))])
baseline['roc'] = 0.5
results = {}
results['recall'] = recall_score(y_test, y_pred)
results['precision'] = precision_score(y_test, y_pred)
results['roc'] = roc_auc_score(y_test, probs)
train_results = {}
train_results['recall'] = recall_score(y_train,
train_predictions)
train_results['precision'] = precision_score(y_train, train_predictions)
train_results['roc'] = roc_auc_score(y_train, train_probs)
# for metric in ['recall', 'precision', 'roc']:
# print(f"Baseline: {round(baseline[metric], 2)}Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}")
# Calculate false positive rates and true positive rates
base_fpr, base_tpr, _ = roc_curve(y_test, [1 for _ in range(len(y_test))])
model_fpr, model_tpr, _ = roc_curve(y_test, probs)
plt.figure(figsize = (8, 6))
plt.rcParams['font.size'] = 16
# Plot both curves
plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
plt.plot(model_fpr, model_tpr, 'r', label = 'model')
plt.legend(); plt.xlabel('False Positive Rate');
plt.ylabel('True Positive Rate'); plt.title('ROC Curves');
plt.show()
# Recall Baseline: 1.0 Test: 0.92 Train: 0.93
# Precision Baseline: 0.48 Test: 0.9 Train: 0.91
# Roc Baseline: 0.5 Test: 0.97 Train: 0.97
evaluate_model(y_pred,probs,train_predictions,train_probs)
def plot_confusion_matrix(cm, classes, normalize = False,
title='Confusion matrix',
cmap=plt.cm.Greens): # can change color
plt.figure(figsize = (10, 10))
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title, size = 24)
plt.colorbar(aspect=4)
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45, size = 14)
plt.yticks(tick_marks, classes, size = 14)
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
# Label the plot
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): plt.text(j, i, format(cm[i, j], fmt),
fontsize = 20,
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.grid(None)
plt.tight_layout()
plt.ylabel('True label', size = 18)
plt.xlabel('Predicted label', size = 18)
# Let's plot it out
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes = ['0 - Susceptible', '1 - Resistant'],
title = 'R/S Confusion Matrix')
print(rf_classifier.feature_importances_)
print(f" There are {len(rf_classifier.feature_importances_)} features in total")
#%%
####################################
# Model 2: case of stratified K-fold
# Logistic regression
# MinMaxScaler
# https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2 [ Didn't work!]
# https://www.geeksforgeeks.org/stratified-k-fold-cross-validation/
####################################
print('Class Ratio:',
sum(Y)/len(Y))
print('Class Ratio:',
sum(my_df['resistance'])/len(my_df['resistance']))
seed_skf = 50
skf = StratifiedKFold(n_splits = 10
, shuffle = True
, random_state = seed_skf)
lst_accu_stratified = []
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X)
#X_scaled = X_scaled[:,[1,2,3]]
#lr = linear_model.LogisticRegression(class_weight = 'unbalanced')
lr = linear_model.LogisticRegression()
for train_index, test_index in skf.split(X, Y):
#print(train_index)
#print(test_index)
x_train_fold, x_test_fold = X_scaled[train_index], X_scaled[test_index]
y_train_fold, y_test_fold = Y[train_index], Y[test_index]
lr.fit(x_train_fold, y_train_fold)
lst_accu_stratified.append(lr.score(x_test_fold, y_test_fold))
# print output
print('List of possible accuracy', lst_accu_stratified)
print('Max accuracy:', max(lst_accu_stratified)*100, "%")
print('Min accuracy:', min(lst_accu_stratified)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratified)*100,"%")
print('St Dev:', stdev(lst_accu_stratified)*100,"%")
#%%
#--------------------------------------
# Model2.1: the same model but with a pipeline
# results differ slightly from Model 2, partly because the scaler is
# re-fitted on each training fold instead of once on the full data
#--------------------------------------
model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
, ('logis', LogisticRegression(class_weight = 'balanced')) ]) # changes stdev
seed_skf = 50
skf = StratifiedKFold(n_splits = 10
, shuffle = True
, random_state = seed_skf)
X_array = np.array(X)
lst_accu_stratified = []
for train_index, test_index in skf.split(X_array, Y):
x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
y_train_fold, y_test_fold = Y[train_index], Y[test_index]
model_logisP.fit(x_train_fold, y_train_fold)
lst_accu_stratified.append(model_logisP.score(x_test_fold, y_test_fold))
# print output
print('List of possible accuracy', lst_accu_stratified)
print('Max accuracy:', max(lst_accu_stratified)*100, "%")
print('Min accuracy:', min(lst_accu_stratified)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratified)*100,"%")
print('St Dev:', stdev(lst_accu_stratified)*100,"%")
####################################
# Model 3: stratified K-fold
# Random forest
# MinMaxScaler
# X: needs to be an array for str Kfold
####################################
model_rf = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
, ('rf' , RandomForestClassifier(n_estimators=100, random_state=42))])
seed_skf = 50
skf = StratifiedKFold(n_splits = 10
, shuffle = True
, random_state = seed_skf)
X_array = np.array(X)
lst_accu_stratified_rf = []
for train_index, test_index in skf.split(X_array, Y):
x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
y_train_fold, y_test_fold = Y[train_index], Y[test_index]
model_rf.fit(x_train_fold, y_train_fold)
lst_accu_stratified_rf.append(model_rf.score(x_test_fold, y_test_fold))
# print output
print('List of possible accuracy', lst_accu_stratified_rf)
print('Max accuracy:', max(lst_accu_stratified_rf)*100, "%")
print('Min accuracy:', min(lst_accu_stratified_rf)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratified_rf)*100,"%")
print('St Dev:', stdev(lst_accu_stratified_rf)*100,"%")
####################################
# Model 4: Cross validate K-fold
# Random forest
# MinMaxScaler
# X: needs to be an array for Kfold
# NOTE: mean_squared_error is a regression metric; use a classification scorer instead (see the sketch below)
####################################
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate
score_fn = make_scorer(mean_squared_error)
scores = cross_validate(model_rf, X_train, y_train
, scoring = score_fn
, cv = 10)
from itertools import combinations
def train(X):
return cross_validate(model_rf, X, y_train
, scoring = score_fn
, cv = 10
, return_estimator = True)['test_score']
scores = [train(X_train.loc[:,vars]) for vars in combinations(X_train.columns, len(X_train.columns))] # r must not exceed the number of columns, else combinations() is empty
means = [score.mean() for score in scores]
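# Sketch addressing the NOTE above: built-in classification scorers can be
# passed by name, avoiding the regression metric altogether.
scores_clf = cross_validate(model_rf, X_train, y_train
, scoring = {'acc': 'accuracy', 'roc': 'roc_auc'}
, cv = 10)
pd.DataFrame(scores_clf).mean()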
#%%
# https://stackoverflow.com/questions/52316237/finding-logistic-regression-weights-from-k-fold-cv
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import KFold
kf = KFold(n_splits=10, shuffle=True, random_state=42)
logistic = LogisticRegressionCV(Cs=2, fit_intercept=True, cv=kf, verbose =1, random_state=42)
logistic.fit(X_train, y_train)
print("Train Coefficient:" , logistic.coef_) #weights of each feature
print("Train Intercept:" , logistic.intercept_) #value of intercept
#%%
# https://machinelearningmastery.com/repeated-k-fold-cross-validation-with-python/
from sklearn.model_selection import cross_val_score
from numpy import std
cv = KFold(n_splits=10, random_state=1, shuffle=True)
scores = cross_val_score(model_rf, X,Y, scoring='accuracy', cv=cv, n_jobs=-1)
scores2 = cross_val_score(model_logisP, X, Y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
print('Accuracy: %.3f (%.3f)' % (mean(scores2), stdev(scores2)))
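# Sketch (assuming the same X and Y): for an imbalanced classification target,
# RepeatedStratifiedKFold keeps the class ratio in every fold while repeating
# the whole split, giving a more stable estimate than a single KFold.
from sklearn.model_selection import RepeatedStratifiedKFold
rskf = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
scores_rskf = cross_val_score(model_rf, X, Y, scoring = 'accuracy', cv = rskf, n_jobs = -1)
print('Accuracy: %.3f (%.3f)' % (mean(scores_rskf), std(scores_rskf)))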


@ -0,0 +1,172 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 3 17:08:18 2022
@author: tanu
"""
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import os
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
import pandas as pd
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
gene = 'pncA'
drug = 'pyrazinamide'
#==============
# directories
#==============
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
#=======
# input
#=======
# this needs to be merged_df2 or merged_df3?
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
my_df = pd.read_csv(infile_ml1)
my_df.dtypes
my_df_cols = my_df.columns
gene_basicL = ['pnca']
geneL_naL = ['gid', 'rpob']
geneL_ppi2L = ['alr', 'embb', 'katg', 'rpob']
#%%============================================================================
# GET Y
# Y = my_df.loc[:,drug] #has NA
dm_om_map = {'DM': 1, 'OM': 0}
my_df['resistance'] = my_df['mutation_info_labels'].map(dm_om_map)
# sanity check
my_df['resistance'].value_counts()
my_df['mutation_info_labels'].value_counts()
Y = my_df['resistance']
# GET X
cols = my_df.columns
X_stability = my_df[['ligand_distance'
, 'ligand_affinity_change'
, 'duet_stability_change'
, 'ddg_foldx'
, 'deepddg'
, 'ddg_dynamut2']]
X_evol = my_df[['consurf_score'
, 'snap2_score'
, 'snap2_accuracy_pc']]
X_str = my_df[['asa'
, 'rsa'
, 'kd_values'
, 'rd_values']]
#%% try combinations
X_vars = X_stability
X_vars = X_evol
X_vars = X_str
X_vars = pd.concat([X_stability, X_evol, X_str], axis = 1)
X_vars = pd.concat([X_stability, X_evol], axis = 1)
X_vars = pd.concat([X_stability, X_str], axis = 1)
X_vars = pd.concat([X_evol, X_str], axis = 1)
#%%
X_vars.shape[1]
# TODO: stratified cross validate
# Train-test Split
rs = {'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X_vars,
Y,
test_size = 0.33,
random_state = 42)
# Classification - Model Pipeline
def modelPipeline(X_train, X_test, y_train, y_test):
log_reg = LogisticRegression(**rs)
nb = BernoulliNB()
knn = KNeighborsClassifier()
svm = SVC(**rs)
mlp = MLPClassifier(max_iter=500, **rs)
dt = DecisionTreeClassifier(**rs)
et = ExtraTreesClassifier(**rs)
rf = RandomForestClassifier(**rs)
xgb = XGBClassifier(**rs, verbosity=0)
clfs = [
('Logistic Regression', log_reg),
('Naive Bayes', nb),
('K-Nearest Neighbors', knn),
('SVM', svm),
('MLP', mlp),
('Decision Tree', dt),
('Extra Trees', et),
('Random Forest', rf),
('XGBoost', xgb)
]
pipelines = []
scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])
for clf_name, clf in clfs:
pipeline = Pipeline(steps=[
('scaler', MinMaxScaler()),
('classifier', clf)
]
)
pipeline.fit(X_train, y_train)
# Model predictions
y_pred = pipeline.predict(X_test)
# F1-Score
fscore = f1_score(y_test, y_pred)
# Precision
pres = precision_score(y_test, y_pred)
# Recall
rcall = recall_score(y_test, y_pred)
# Accuracy
accu = accuracy_score(y_test, y_pred)
# ROC_AUC (computed on hard labels; predict_proba scores would be preferable)
roc_auc = roc_auc_score(y_test, y_pred)
pipelines.append(pipeline)
scores_df = pd.concat([scores_df
, pd.DataFrame([{'Model' : clf_name
, 'F1_Score' : fscore
, 'Precision' : pres
, 'Recall' : rcall
, 'Accuracy' : accu
, 'ROC_AUC' : roc_auc}])]
, ignore_index = True) # DataFrame.append is deprecated
return pipelines, scores_df
modelPipeline(X_train, X_test, y_train, y_test)
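# Sketch: capture the returned pipelines and scores, then rank the models.
pipelines, scores_df = modelPipeline(X_train, X_test, y_train, y_test)
print(scores_df.sort_values('F1_Score', ascending = False))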


@ -0,0 +1,207 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 4 14:54:30 2022
@author: tanu
"""
import os, sys
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/")
# my function
from MultClassPipe import MultClassPipeline
gene = 'pncA'
drug = 'pyrazinamide'
#==============
# directories
#==============
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
#=======
# input
#=======
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
my_df = pd.read_csv(infile_ml1)
my_df.dtypes
my_df_cols = my_df.columns
geneL_basic = ['pnca']
geneL_na = ['gid']
geneL_na_ppi2 = ['rpob']
geneL_ppi2 = ['alr', 'embb', 'katg']
#%% get cols
mycols = my_df.columns
#%%============================================================================
# GET Y
# Target1: mutation_info_labels
dm_om_map = {'DM': 1, 'OM': 0}
target1 = my_df['mutation_info_labels'].map(dm_om_map)
# Target2: drug
drug_labels = drug + '_labels'
drug_labels
my_df[drug_labels] = my_df[drug].map({1: 'resistant', 0: 'sensitive'})
my_df[drug_labels].value_counts()
my_df[drug_labels] = my_df[drug_labels].fillna('unknown')
my_df[drug_labels].value_counts()
target2 = my_df[drug_labels]
# Target3: drtype
drtype_labels = 'drtype_labels'
my_df[drtype_labels] = my_df['drtype'].map({'Sensitive' : 0
, 'Other' : 0
, 'Pre-MDR' : 1
, 'MDR' : 1
, 'Pre-XDR' : 1
, 'XDR' : 1})
# target3 = my_df['drtype']
target3 = my_df[drtype_labels]
# target4
drtype_labels2 = 'drtype_labels2'
my_df[drtype_labels2] = my_df['drtype'].map({'Sensitive' : 0
, 'Other' : 0
, 'Pre-MDR' : 1
, 'MDR' : 1
, 'Pre-XDR' : 2
, 'XDR' : 2})
target4 = my_df[drtype_labels2]
# sanity checks
target1.value_counts()
my_df['mutation_info_labels'].value_counts()
target2.value_counts()
my_df[drug_labels].value_counts()
target3.value_counts()
my_df['drtype'].value_counts()
target4.value_counts()
my_df['drtype'].value_counts()
#%%
# GET X
common_cols_stabilty = ['ligand_distance'
, 'ligand_affinity_change'
, 'duet_stability_change'
, 'ddg_foldx'
, 'deepddg'
, 'ddg_dynamut2']
# Build stability columns ~ gene
if gene.lower() in geneL_basic:
x_stability_cols = common_cols_stabilty
if gene.lower() in geneL_ppi2:
x_stability_cols = common_cols_stabilty + ['mcsm_ppi2_affinity'
, 'interface_dist']
if gene.lower() in geneL_na:
x_stability_cols = common_cols_stabilty + ['mcsm_na_affinity']
if gene.lower() in geneL_na_ppi2:
x_stability_cols = common_cols_stabilty + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
# D1148: drop rows where mcsm_na_affinity is missing (column present only for NA-binding genes)
na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
my_df = my_df.drop(index=na_index)
X_strF = ['asa'
, 'rsa'
, 'kd_values'
, 'rd_values']
X_evolF = ['consurf_score'
, 'snap2_score'
, 'snap2_accuracy_pc']
# TODO: ADD ED values
# Problematic due to NA
# X_genomicF = ['af'
# , 'or_mychisq'
# , 'or_logistic'
# , 'or_fisher'
# , 'pval_fisher']
#%% try combinations
X_vars1 = my_df[x_stability_cols]
X_vars2 = my_df[X_strF]
X_vars3 = my_df[X_evolF]
#X_vars4 = my_df[X_genomicF]
#X_vars4 = X_vars4.fillna('unknown') # need one hot encoder!
X_vars5 = my_df[x_stability_cols + X_strF]
X_vars6 = my_df[x_stability_cols + X_evolF]
#X_vars7 = my_df[x_stability_cols + X_genomicF]
X_vars8 = my_df[X_strF + X_evolF]
#X_vars9 = my_df[X_strF + X_genomicF]
#X_vars10 = my_df[X_evolF + X_genomicF]
X_vars11 = my_df[x_stability_cols + X_strF + X_evolF]
#X_vars12 = my_df[x_stability_cols + X_strF + X_evolF + X_genomicF]
numerical_features_names = x_stability_cols + X_strF + X_evolF
# separate ones for foldx?
categorical_features_names = ['ss_class'
, 'wt_prop_water'
# , 'lineage_labels' # misleading if using merged_df3
, 'mut_prop_water'
, 'wt_prop_polarity'
, 'mut_prop_polarity'
, 'wt_calcprop'
, 'mut_calcprop'
, 'active_aa_pos']
numerical_features_df = my_df[numerical_features_names]
numerical_features_df.shape
categorical_features_df = my_df[categorical_features_names]
categorical_features_df.shape
all_features_df = my_df[numerical_features_names + categorical_features_names]
all_features_df.shape
#%%
X_vars1.shape[1]
X_vars5.shape[1]
# TODO: stratified cross validate
# Train-test Split
# TARGET1
X_train, X_test, y_train, y_test = train_test_split(X_vars1,
target1,
test_size = 0.33,
random_state = 42)
t1_res = MultClassPipeline(X_train, X_test, y_train, y_test)
t1_res
# TARGET3
X_train3, X_test3, y_train3, y_test3 = train_test_split(X_vars5,
target3,
test_size = 0.33,
random_state = 42)
t3_res = MultClassPipeline(X_train3, X_test3, y_train3, y_test3)
t3_res
#%%
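# Sketch for the "stratified cross validate" TODO above: as a first step,
# train_test_split can stratify the single split on the target so both
# halves keep the class ratio (assuming target1 and X_vars1 as above).
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_vars1
, target1
, test_size = 0.33
, random_state = 42
, stratify = target1)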


@ -0,0 +1,207 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 5 12:57:32 2022
@author: tanu
"""
import os, sys
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
#############################
# trying feature selection
#############################
#%%
model= Pipeline(steps = [
('pre', MinMaxScaler()),
('reg', LogisticRegression(class_weight = 'balanced'))])
def precision(y_true,y_pred):
return precision_score(y_true,y_pred,pos_label = 1)
def recall(y_true,y_pred):
return recall_score(y_true, y_pred, pos_label = 1)
def f1(y_true,y_pred):
return f1_score(y_true, y_pred, pos_label = 1)
acc = make_scorer(accuracy_score)
prec = make_scorer(precision)
rec = make_scorer(recall)
f1_scorer = make_scorer(f1) # separate name so the f1() helper is not overwritten
output = cross_validate(model, X_train, y_train
, scoring = {'acc' : acc
,'prec': prec
,'rec' : rec
,'f1' : f1_scorer}
, cv = 10
, return_train_score = False)
pd.DataFrame(output).mean()
#%%
# classification_report: reports per-class precision/recall/F1, so it shows how every class scores
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1_score(y_test, y_pred)
roc_auc_score (y_test, y_pred) # on hard labels; probability scores are preferable
#roc_curve(y_test, y_pred)
classification_report(y_test, y_pred)
target_names = {1:'Resistant', 0:'Sensitive'}
print(classification_report(y_test
, y_pred
#, target_names=y_test.map(target_names)
))
#%%NOT SURE!
from itertools import combinations
def train(X):
return cross_validate(model, X, y_train
#, scoring = make_scorer(accuracy_score)
, scoring = {'acc' : acc
,'prec' : prec
,'rec' : rec
,'f1' : f1_scorer}
, cv = 10
, return_train_score = False)
#, return_estimator = True)['test_score']
scores = [train(X_train.loc[:,vars]) for vars in combinations(X_train.columns, len(X_train.columns))]
means = [pd.DataFrame(score).mean() for score in scores] # cross_validate returns a dict, so wrap it before averaging
means
#%%
# TO TRY
# https://rasbt.github.io/mlxtend/
# stackoverflow
# informative post
# https://datascience.stackexchange.com/questions/937/does-scikit-learn-have-a-forward-selection-stepwise-regression-algorithm
# https://datascience.stackexchange.com/questions/24405/how-to-do-stepwise-regression-using-sklearn/24447#24447
# https://stats.stackexchange.com/questions/204141/difference-between-selecting-features-based-on-f-regression-and-based-on-r2
# supported from scikit-learn 0.24:
# https://scikit-learn.org/stable/auto_examples/release_highlights/plot_release_highlights_0_24_0.html#new-sequentialfeatureselector-transformer
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html
# https://www.scikit-yb.org/en/latest/api/model_selection/rfecv.html
#GridSearchCV
#ParameterGrid
#RandomizedSearchCV
#https://medium.com/analytics-vidhya/hyper-parameter-tuning-gridsearchcv-vs-randomizedsearchcv-499862e3ca5
#%% RFE: Feature selection in classification
# others in example
# https://towardsdatascience.com/feature-selection-techniques-for-classification-and-python-tips-for-their-application-10c0ddd7918b
# https://towardsdatascience.com/feature-selection-using-python-for-classification-problem-b5f00a1c7028
#model_logistic = LogisticRegression(solver='lbfgs'
# , multi_class = 'multinomial'
# , max_iter = 1000)
model_logistic = LogisticRegression() # the multinomial variant above is kept as a commented alternative
sel_rfe_logistic = RFE(estimator = model_logistic
, n_features_to_select = 4
, step = 1)
X_train_rfe_logistic = sel_rfe_logistic.fit_transform(X_train, y_train)
print(sel_rfe_logistic.get_support())
print(sel_rfe_logistic.ranking_)
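# Sketch: translate the boolean support mask into column names (assumes
# X_train is still a DataFrame at this point).
selected_features = X_train.columns[sel_rfe_logistic.get_support()]
print('RFE-selected features:', list(selected_features))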
#%% RFECV
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
# pick one target and one feature set; later assignments/splits override earlier ones
target = target1
target = target3
target = target4
X_train, X_test, y_train, y_test = train_test_split(X_vars1,
target,
test_size = 0.33,
random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(X_vars2,
target,
test_size = 0.33,
random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(X_vars3,
target,
test_size = 0.33,
random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(X_vars5,
target,
test_size = 0.33,
random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(X_vars11,
target,
test_size = 0.33,
random_state = 42)
model_logistic2 = LogisticRegression()
sel_rfe_logistic = RFECV(estimator = model_logistic2
, cv = 10
, step = 1)
X_train_rfe_logistic = sel_rfe_logistic.fit_transform(X_train, y_train)
print(sel_rfe_logistic.get_support())
X_train.columns
print(sel_rfe_logistic.ranking_)
#%%
# TODO: imputation
# Find out the best way to impute values!
#from sklearn.impute import SimpleImputer
# https://towardsdatascience.com/whats-the-best-way-to-handle-nan-values-62d50f738fc
#KNN and MICE
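# Sketch (assuming numeric columns only): scikit-learn ships a KNN imputer
# and an experimental iterative, MICE-like imputer.
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer # noqa: F401, enables the import below
from sklearn.impute import IterativeImputer
knn_imputer = KNNImputer(n_neighbors = 5)
mice_imputer = IterativeImputer(max_iter = 10, random_state = 42)
# e.g. my_df2[genomicF] = knn_imputer.fit_transform(my_df2[genomicF])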
my_df2 = pd.read_csv(infile_ml1)
genomicF = ['af'
, 'beta_logistic'
, 'or_logistic'
, 'pval_logistic'
, 'se_logistic'
, 'zval_logistic'
, 'ci_low_logistic'
, 'ci_hi_logistic'
, 'or_mychisq'
, 'log10_or_mychisq'
, 'or_fisher'
, 'pval_fisher'
, 'neglog_pval_fisher'
, 'ci_low_fisher'
, 'ci_hi_fisher'
, 'est_chisq'
, 'pval_chisq']
# X_genomicF = ['af'
# , 'or_mychisq'
# , 'or_logistic'
# , 'or_fisher'
# , 'pval_fisher']
my_df2[genomicF].isna().sum()
my_df2[genomicF] = my_df2[genomicF].fillna(value='unknown') # NOTE: turns numeric columns into object dtype; only sensible if treated as categorical afterwards


@ -0,0 +1,118 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 5 12:57:32 2022
@author: tanu
"""
#%%
# data, etc for now comes from my_data6.py and/or my_data5.py
#%% try combinations
#import sys, os
#os.system("imports.py")
#%%
seed = 42
features_to_encode = list(X_train.select_dtypes(include = ['object']).columns)
col_trans = make_column_transformer(
(OneHotEncoder(),features_to_encode),
remainder = "passthrough"
)
rf_classifier = RandomForestClassifier(
min_samples_leaf=50,
n_estimators=150,
bootstrap=True,
oob_score=True,
n_jobs=-1,
random_state=seed,
max_features='sqrt') # 'auto' is deprecated for classifiers and equals 'sqrt'
pipe = make_pipeline(col_trans, rf_classifier)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
#%%
all_features_df.shape
X_train, X_test, y_train, y_test = train_test_split(all_features_df,
target1,
test_size = 0.33,
random_state = 42)
preprocessor = ColumnTransformer(
transformers=[
('num', MinMaxScaler() , numerical_features_names)
,('cat', OneHotEncoder(), categorical_features_names)]) # ColumnTransformer takes column names, not the DataFrames themselves
seed = 42
rf_classifier = RandomForestClassifier(
min_samples_leaf=50,
n_estimators=150,
bootstrap=True,
oob_score=True,
n_jobs=-1,
random_state=seed,
max_features='sqrt')
preprocessor.fit(all_features_df)
preprocessor.transform(all_features_df)
model = Pipeline(steps = [
('preprocess', preprocessor)
,('regression',linear_model.LogisticRegression())
])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred
def precision(y_true,y_pred):
return precision_score(y_true,y_pred,pos_label = 1)
def recall(y_true,y_pred):
return recall_score(y_true, y_pred, pos_label = 1)
def f1(y_true,y_pred):
return f1_score(y_true, y_pred, pos_label = 1)
acc = make_scorer(accuracy_score)
prec = make_scorer(precision)
rec = make_scorer(recall)
f1_scorer = make_scorer(f1) # separate name so the f1() helper is not overwritten
output = cross_validate(model, X_train, y_train
, scoring = {'acc' : acc
,'prec': prec
,'rec' : rec
,'f1' : f1_scorer}
, cv = 10
, return_train_score = False)
pd.DataFrame(output).mean()
#%% with feature selection
preprocessor.fit(numerical_features_df)
preprocessor.transform(numerical_features_df)
model = Pipeline(steps = [
('preprocess', preprocessor)
,('regression',linear_model.LogisticRegression())
])
selector_logistic = RFECV(estimator = model
, cv = 10
, step = 1
, importance_getter = 'named_steps.regression.coef_') # a Pipeline exposes no coef_ itself, so point RFECV at the final step
X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df
, target1
, test_size = 0.33
, random_state = 42)
selector_logistic_xtrain = selector_logistic.fit_transform(X_trainN, y_trainN)
print(selector_logistic.get_support())
X_trainN.columns
print(selector_logistic.ranking_)


@ -0,0 +1,144 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 5 12:57:32 2022
@author: tanu
"""
#%%
# data, etc for now comes from my_data6.py and/or my_data5.py
#%% try combinations
#import sys, os
#os.system("imports.py")
def precision(y_true,y_pred):
return precision_score(y_true,y_pred,pos_label = 1)
def recall(y_true,y_pred):
return recall_score(y_true, y_pred, pos_label = 1)
def f1(y_true,y_pred):
return f1_score(y_true, y_pred, pos_label = 1)
#%%
numerical_features_df.shape
categorical_features_df.shape
all_features_df.shape
#%%
target = target1
#target = target3
X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df,
target,
test_size = 0.33,
random_state = 42)
X_trainC, X_testC, y_trainC, y_testC = train_test_split(categorical_features_df,
target,
test_size = 0.33,
random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(all_features_df,
target,
test_size = 0.33,
random_state = 42)
#%%
#%%
preprocessor = ColumnTransformer(
transformers=[
('num', MinMaxScaler() , numerical_features_names)
,('cat', OneHotEncoder(), categorical_features_names)
], remainder = 'passthrough')
f = preprocessor.fit(numerical_features_df)
f2 = preprocessor.transform(numerical_features_df)
f3 = preprocessor.fit_transform(numerical_features_df)
(f3==f2).all()
preprocessor.fit_transform(numerical_features_df)
#preprocessor.fit_transform(all_features_df)
#%%
model_log = Pipeline(steps = [
('preprocess', preprocessor)
#,('log_reg', linear_model.LogisticRegression())
,('log_reg', LogisticRegression(
class_weight = 'balanced'))
])
model = model_log
#%%
seed = 42
model_rf = Pipeline(steps = [
('preprocess', preprocessor)
,('rf', RandomForestClassifier(
min_samples_leaf=50,
n_estimators=150,
bootstrap=True,
oob_score=True,
n_jobs=-1,
random_state=seed,
max_features='sqrt')) # 'auto' is deprecated for classifiers and equals 'sqrt'
])
model = model_rf
#%%
model.fit(X_trainN, y_trainN)
y_pred = model.predict(X_testN)
y_pred
acc = make_scorer(accuracy_score)
prec = make_scorer(precision)
rec = make_scorer(recall)
f1_scorer = make_scorer(f1) # separate name so the f1() helper is not overwritten
output = cross_validate(model, X_trainN, y_trainN
, scoring = {'acc' : acc
,'prec': prec
,'rec' : rec
,'f1' : f1_scorer}
, cv = 10
, return_train_score = False)
pd.DataFrame(output).mean()
#%% Run multiple models using MultClassPipeline
# only suitable for numerical features; categorical features are not supported yet
t1_res = MultClassPipeline2(X_trainN, X_testN, y_trainN, y_testN, input_df = all_features_df)
t1_res
#%%
# https://machinelearningmastery.com/columntransformer-for-numerical-and-categorical-data/
#Each transformer is a three-element tuple that defines the name of the transformer, the transform to apply, and the column indices to apply it to. For example:
# (Name, Object, Columns)
# Determine categorical and numerical features
numerical_ix = all_features_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = all_features_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix
# Define the data preparation for the columns
t = [('cat', OneHotEncoder(), categorical_ix)
, ('num', MinMaxScaler(), numerical_ix)]
col_transform = ColumnTransformer(transformers=t
, remainder='passthrough')
# create pipeline (unlike example above where the col transfer was a preprocess step and it was fit_transformed)
pipeline = Pipeline(steps=[('prep', col_transform)
, ('classifier', LogisticRegression())])
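# Sketch: the mixed-type pipeline can be scored directly with cross-validation;
# the ColumnTransformer is then re-fitted inside every fold (assumes
# all_features_df and target as above).
from sklearn.model_selection import cross_val_score
cv_f1 = cross_val_score(pipeline, all_features_df, target, cv = 10, scoring = 'f1')
print('Mean F1:', cv_f1.mean())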
#%% Added this to the MultClassPipeline
tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN)
tN_res
t2_res = MultClassPipeline2(X_train, X_test, y_train, y_test, input_df = all_features_df)
t2_res
t3_res = MultClassPipeSKF(input_df = numerical_features_df
, y_targetF = target1
, var_type = 'numerical'
, skf_splits = 10)
t3_res
t4_res = MultClassPipeSKF(input_df = all_features_df
, y_targetF = target1
, var_type = 'mixed'
, skf_splits = 10)
t4_res

earlier_versions/p_jr_d1.py Normal file

@ -0,0 +1,405 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 17 14:52:55 2022
@author: tanu
"""
from sklearn.datasets import load_boston
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
boston = load_boston() # NOTE: load_boston is deprecated and was removed in scikit-learn 1.2
dir(boston)
#['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'target']
X, y = boston.data, boston.target
df = pd.DataFrame(X, columns = boston.feature_names)
df['MEDV'] = y
sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
plt.show()
#Model fitting
#To fit a model using just a single predictor we first extract the training variables.
X_train = df['CRIM']
y_train = y
# Unfortunately, sklearn's various model fitting functions typically expect a
# two-dimensional array for the covariates. Since we have extracted only
# a single feature here, it is one dimensional. We need to reshape the
# X_train values to the appropriate shape.
# This is not necessary if using more than a single feature.
if len(X_train.values.shape) == 1:
X_train = X_train.values.reshape(-1, 1)
# Create a LinearRegression object: this object is of a broader class of
# estimator objects.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
# We can make predictions from our fitted model with the .predict() method.
new_value = np.array(4.09, ndmin = 2)
model.predict(new_value)
multiple_values = np.array([1, 2, 3], ndmin = 2).T
model.predict(multiple_values)
#Fitted values
#The fitted values of a model are the predicted ŷ for the observations X.
#To get the model fitted values we could just predict from the model using
#the values used to train it.
fitted = model.predict(X_train)
ax = sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
sns.lineplot(x = df['CRIM'], y = fitted, ax = ax)
plt.show()
# Interpreting the coefficients
# The coefficients of the fitted model are kept in the model.coef_ attribute.
# This gives us the expected change in y for a unit change in X .
model.coef_
#2.3 Multiple linear regression
X_train = df.iloc[:,:3]
grid = sns.PairGrid(data=pd.concat([X_train,pd.Series(y_train,name="MEDV")],axis = 1))
grid.map_offdiag(sns.scatterplot)
grid.map_diag(sns.histplot) # distplot is deprecated in newer seaborn
plt.show()
model.fit(X_train, y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)
#Residuals
#In classical statistics, one of our assumptions is that the residuals
#are normally distributed. A small RSS implies the fitted model is
#closer to the observations.
fitted = model.predict(X_train)
resid = y_train - fitted
# Standardise to remove effect of measurement scale
resid = (resid - np.mean(resid))/np.std(resid,ddof = 1)
plt.figure()
for i in range(3):
xvar = X_train.iloc[:,i]
ax = plt.subplot(3, 1, i + 1)
ax.scatter(xvar, resid)
ax.set_xlabel(boston.feature_names[i])
ax.set_ylabel("Residuals")
ax.hlines([-2, 0, 2], np.min(xvar), np.max(xvar))
plt.show()
plt.figure()
ax = plt.subplot(3, 1, 1)
ax.scatter(fitted,resid)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')
ax = plt.subplot(3,1,2)
ax.scatter(fitted,y_train)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Predicted values')
ax = plt.subplot(3, 1,3)
import scipy.stats as stats
stats.probplot(resid,dist = 'norm',plot = ax)
plt.show()
#Scaling data: many types available
# sklearn comes with many preprocessing transformations in the sklearn.preprocessing module
#Scaling is crucial for many statistical and machine learning algorithms
# • k-means and hierarchical clustering
# Data units & variance play crucial role in cluster selection
# • Using gradient descent optimization
# Scaled data allows the weights to update at an equal speed
# • Scaled data allows the regression coefficients to be compared
#########################################################
# Min-max scaling
# DOESN'T change the shape
# DOES change the bounds, mean and sd
# NOT often used in LR
# used more in GDO (gradient Descent Optimisation)
# sklearn.preprocessing module has a MinMaxScaler() for this
##########################################################
np.random.seed(1)
x_n = np.random.normal(2, 5, 500)
x_t = np.random.standard_t(2, 500)
x_ln = np.random.lognormal(1, 1, 500)
df = pd.DataFrame({ 'Normal': x_n, 'T': x_t, 'Lognormal': x_ln
})
df_long = df.melt(var_name='Distribution')
g = sns.FacetGrid(df_long, col='Distribution',sharex=False)
g.map(plt.hist, 'value', bins = 50)
plt.show()
def min_max(x):
x_min = np.min(x) # avoid shadowing the built-in min()
s = (x - x_min)/(np.max(x) - x_min)
return (s)
scaled = df.apply(min_max).melt(var_name='Distribution')
scaled['Scaled'] = True
df_long['Scaled'] = False
full_data = pd.concat([df_long, scaled], axis=0)
g = sns.FacetGrid(full_data, col='Distribution'
,row='Scaled'
, sharex=False
, sharey=False)
g.map(plt.hist, 'value', bins = 50)
plt.show()
df.apply([np.mean,np.std])
df.apply(min_max).apply([np.mean,np.std])
# sklearn: MinMaxScaler()
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled[:1]
##########################################################
# z-score standardisation
# DOESN'T change the shape
# popular in linear models
# DOESN'T effect the predictions
# but makes the size of the coeffs directly comparable
# sklearn.preprocessing module has a StandardScaler() for this
##########################################################
def z_score(x):
mean = np.mean(x)
std = np.std(x, ddof=1)
return (x - mean)/std
scaled = df.apply(z_score).melt(var_name='Distribution')
scaled['Scaled'] = True
full_data = pd.concat([df_long, scaled], axis=0)
g = sns.FacetGrid(full_data, col='Distribution'
, row ='Scaled'
, sharex=False
,sharey=False)
g.map(plt.hist, 'value', bins=50)
###############################################
# Dividing by two standard deviations
# http://www.stat.columbia.edu/~gelman/research/published/standardizing7.pdf
# One of the downsides of scaling data by z-scoring is that it is not obvious
# how this should be handled in the case of categorical variables.
# Gelman suggests the use of a rescaling that divides numeric variables
# by two standard deviations, whilst leaving binary encoded categorical
# variables untransformed.
# nothing in sklearn for this
###############################################
from sklearn.base import BaseEstimator, TransformerMixin
class two_sd_scaler(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
self.stds = 2*np.std(X, axis=0, ddof=1)
return self
def transform(self, X, y=None):
return X/self.stds
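# Sketch: the custom transformer drops into the usual fit/transform workflow
# (TransformerMixin supplies fit_transform for free).
scaler_2sd = two_sd_scaler()
X_train_2sd = scaler_2sd.fit_transform(X_train)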
# Having preprocessed the data this way we can now fit a model to it in the
# same way as before.
model2 = linear_model.LinearRegression()
model2.fit(X_train_scaled, y_train)
#When making predictions on new values we also need to make sure to pass
#the new values through the same preprocessing step.
new_value = np.array(X_train.mean(), ndmin = 2)
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred
##########################
# 2.5 Creating a pipeline
##########################
# For any training data set and any data for prediction we will want to apply
# the same scaling transformation and use the same model. We could create
# a sklearn.pipeline.Pipeline() to organise the steps of creating the
# estimator.
from sklearn.pipeline import Pipeline
model = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
,('regression', linear_model.LinearRegression())
])
# Having created the Pipeline object we can now fit as before. Calling
# .fit() now however, will first fit the 'preprocess' step and then the
# 'regression' step. When we predict, the new values will also pass through
# both stages of our pipeline.
model.fit(X_train,y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)
#from sklearn.metrics import accuracy_score
#print(accuracy_score(y_test, model.predict(X_test)))
#2.6 Preprocessing categorical variables
# One hot encoding: will take a categorical feature with K categories and
# create a one of K encoding scheme. I.e a set of binary variables for each
# category. Consider the toy data
toy = pd.DataFrame({
'category':['a', 'a', 'b', 'c', 'b']
})
enc = preprocessing.OneHotEncoder()
enc.fit(toy)
enc.transform(toy).toarray()
#Combining preprocessing steps:
# we can combine the preprocessing steps into a single operation
# for our Pipeline using a sklearn.compose.ColumnTransformer
toy = pd.DataFrame({
'numeric': [1., 2., 3., 4., 5.],
'category': ['a', 'a', 'b', 'c', 'b']
})
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
numeric_features = ['numeric']
categorical_features = ['category']
preprocessor = ColumnTransformer(transformers=[('num', StandardScaler()
, numeric_features)
,('cat', OneHotEncoder(), categorical_features)])
# Model Assessment and Feature Selection
#%%#####################################################################
# Accuracy score is only for classification problems.
# For regression problems use: R2 score, MSE (mean squared error) or
# RMSE (root mean squared error), as sketched below.
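# A hedged sketch of those regression metrics (assumes a fitted regression
# model `reg` and a matching test split; these names are illustrative only):
# from sklearn.metrics import r2_score, mean_squared_error
# import numpy as np
# y_pred = reg.predict(X_test)
# print(r2_score(y_test, y_pred))                     # R2
# print(mean_squared_error(y_test, y_pred))           # MSE
# print(np.sqrt(mean_squared_error(y_test, y_pred)))  # RMSE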
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import accuracy_score
import numpy as np
# read data
iris = datasets.load_iris()
# assign X and y
X = iris.data
y = iris.target
# split data into train and test sets (25% of the data is held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
# preprocess the data
# scaling
scaler = preprocessing.MinMaxScaler()
# fit the scaler to X_train
scaler.fit(X_train)
# apply the scaling/transformation to the data
X_train_scaled = scaler.transform(X_train)
# Choose the required model(s)
# model2 = linear_model.LinearRegression() # would fail below: classification
# metrics can't handle a mix of multiclass and continuous targets
model2 = DecisionTreeClassifier()
# fit the model to the data for predictions
model2.fit(X_train_scaled, y_train)
# check model performance (X_test must pass through the same scaler)
print(accuracy_score(y_test, model2.predict(scaler.transform(X_test))))
#When making predictions on new values we also need to make sure to pass
#the new values through the same preprocessing step.
new_value = np.array(X_train.mean(), ndmin = 2)
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred
# or Create a pipeline that standardizes the data then creates a model
# make a pipeline
# PCA(Dimension reduction to two) -> Scaling the data -> DecisionTreeClassification
#https://www.geeksforgeeks.org/pipelines-python-and-scikit-learn/
pipe1 = Pipeline([('pca', PCA(n_components = 2))
, ('std', StandardScaler())
, ('decision_tree', DecisionTreeClassifier())]
, verbose = True)
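# Note: pipe1 runs PCA before scaling; scaling first is the more common
# convention, since PCA is sensitive to feature variance. A hedged alternative:
pipe1b = Pipeline([('std', StandardScaler())
                   , ('pca', PCA(n_components = 2))
                   , ('decision_tree', DecisionTreeClassifier())])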
pipe2 = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
#,('regression', linear_model.LinearRegression())
,('rf', RandomForestClassifier())
])
# fit pipeline to TRAINING data [X_train and y_train]
pipe1.fit(X_train, y_train)
pipe2.fit(X_train, y_train)
# model prediction on TEST data [X_test and y_test]
print(accuracy_score(y_test, pipe1.predict(X_test)))
print(accuracy_score(y_test, pipe2.predict(X_test)))
# classification_report is a sklearn.metrics function, not a Pipeline method
from sklearn.metrics import classification_report
print(classification_report(y_test, pipe2.predict(X_test)))
# NB: one-hot encoding the continuous iris features below is illustrative only
enc = preprocessing.OneHotEncoder()
enc.fit(X_train)
enc.transform(X_train).toarray()
#%%
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import load_boston
import pandas as pd
boston = load_boston()
X_train, y_train = pd.DataFrame(boston.data, columns = boston.feature_names), boston.target
model1 = Pipeline(steps = [
('pre', MinMaxScaler()),
('reg', LinearRegression())])
score_fn = make_scorer(mean_squared_error)
scores = cross_validate(model1, X_train, y_train
, scoring = score_fn
, cv = 10)
from itertools import combinations
def train(X):
    return cross_validate(model1, X, y_train
                          , scoring = score_fn
                          #, return_train_score = False
                          , return_estimator = True)['test_score']
# exhaustively score every 12-of-13 feature subset (13 fits in total)
scores = [train(X_train.loc[:,vars]) for vars in combinations(X_train.columns, 12)]
means = [score.mean() for score in scores]
means
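# Hedged follow-up: the scorer is MSE, so the best 12-feature subset is the
# one with the smallest mean score. `combos` re-materialises the same
# combinations for lookup (the argmin index matches the scores/means order).
import numpy as np
combos = list(combinations(X_train.columns, 12))
best_subset = combos[int(np.argmin(means))]
best_subset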

View file

@ -0,0 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 23 11:13:45 2022
@author: tanu
"""

View file

@ -0,0 +1,99 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 7 15:20:42 2022
@author: tanu
"""
fit_time 0.008588
score_time 0.004460
test_acc 0.690148
test_prec 0.690868
test_rec 0.771250
test_f1 0.725441
# RF
fit_time 0.368793
score_time 0.110153
test_acc 0.672537
test_prec 0.664875
test_rec 0.790417
test_f1 0.720224
dtype: float64
#%%
numerical_features: ['ligand_distance', 'ligand_affinity_change'
, 'duet_stability_change', 'ddg_foldx', 'deepddg', 'ddg_dynamut2'
, 'asa', 'rsa', 'kd_values', 'rd_values'
, 'consurf_score', 'snap2_score', 'snap2_accuracy_pc']
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.734177 0.690476 0.783784 0.700000 0.694922
1 Naive Bayes 0.467290 0.757576 0.337838 0.592857 0.608313
2 K-Nearest Neighbors 0.773006 0.707865 0.851351 0.735714 0.728706
3 SVM 0.766467 0.688172 0.864865 0.721429 0.712735
4 MLP 0.725000 0.674419 0.783784 0.685714 0.679771
5 Decision Tree 0.662069 0.676056 0.648649 0.650000 0.650082
6 Extra Trees 0.748387 0.716049 0.783784 0.721429 0.717649
7 Random Forest 0.722581 0.691358 0.756757 0.692857 0.688984
8 Random Forest2 0.731707 0.666667 0.810811 0.685714 0.678133
9 XGBoost 0.692810 0.670886 0.716216 0.664286 0.661138
all_features: numerical_features + ['ss_class', 'wt_prop_water', 'mut_prop_water', 'wt_prop_polarity',
'mut_prop_polarity', 'wt_calcprop', 'mut_calcprop', 'active_aa_pos']
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.757764 0.701149 0.824324 0.721429 0.715192
1 Naive Bayes 0.620690 0.633803 0.608108 0.607143 0.607084
2 K-Nearest Neighbors 0.619355 0.592593 0.648649 0.578571 0.574324
3 SVM 0.766467 0.688172 0.864865 0.721429 0.712735
4 MLP 0.738854 0.698795 0.783784 0.707143 0.702498
5 Decision Tree 0.666667 0.701493 0.635135 0.664286 0.666052
6 Extra Trees 0.728395 0.670455 0.797297 0.685714 0.678952
7 Random Forest 0.763636 0.692308 0.851351 0.721429 0.713554
8 Random Forest2 0.746988 0.673913 0.837838 0.700000 0.691646
9 XGBoost 0.710526 0.692308 0.729730 0.685714 0.683047
#%%
Model F1_Score Precision Recall Accuracy ROC_AUC
0Num Logistic Regression 0.734177 0.690476 0.783784 0.700000 0.694922
0All Logistic Regression 0.757764 0.701149 0.824324 0.721429 0.715192
1Num Naive Bayes 0.467290 0.757576 0.337838 0.592857 0.608313
1All Naive Bayes 0.620690 0.633803 0.608108 0.607143 0.607084
2Num K-Nearest Neighbors 0.773006 0.707865 0.851351 0.735714 0.728706 ** 'Num' is better than 'All'
2All K-Nearest Neighbors 0.619355 0.592593 0.648649 0.578571 0.574324
3Num SVM 0.766467 0.688172 0.864865 0.721429 0.712735
3All SVM 0.766467 0.688172 0.864865 0.721429 0.712735
4Num MLP 0.725000 0.674419 0.783784 0.685714 0.679771
4All MLP 0.738854 0.698795 0.783784 0.707143 0.702498
5Num Decision Tree 0.662069 0.676056 0.648649 0.650000 0.650082 ** marginal, equivalent
5All Decision Tree 0.666667 0.701493 0.635135 0.664286 0.666052
6Num Extra Trees 0.748387 0.716049 0.783784 0.721429 0.717649 ** marginal, equivalent
6All Extra Trees 0.728395 0.670455 0.797297 0.685714 0.678952
7Num Random Forest 0.722581 0.691358 0.756757 0.692857 0.688984
7All Random Forest 0.763636 0.692308 0.851351 0.721429 0.713554
8Num Random Forest2 0.731707 0.666667 0.810811 0.685714 0.678133
8All Random Forest2 0.746988 0.673913 0.837838 0.700000 0.691646
9Num XGBoost 0.692810 0.670886 0.716216 0.664286 0.661138
9All XGBoost 0.710526 0.692308 0.729730 0.685714 0.683047
#%%
Model F1_Score Precision Recall Accuracy ROC_AUC
0 Logistic Regression 0.757764 0.701149 0.824324 0.721429 0.715192
1 Naive Bayes 0.628571 0.666667 0.594595 0.628571 0.630631
2 K-Nearest Neighbors 0.666667 0.623529 0.716216 0.621429 0.615684
3 SVM 0.766467 0.688172 0.864865 0.721429 0.712735
4 MLP 0.726115 0.686747 0.770270 0.692857 0.688165
5 Decision Tree 0.647482 0.692308 0.608108 0.650000 0.652539
6 Extra Trees 0.760736 0.696629 0.837838 0.721429 0.714373
7 Random Forest 0.736196 0.674157 0.810811 0.692857 0.685708
8 Random Forest2 0.736196 0.674157 0.810811 0.692857 0.685708
9 XGBoost 0.710526 0.692308 0.729730 0.685714 0.683047

View file

@ -0,0 +1,69 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 21 13:06:25 2022
@author: tanu
"""
# imports assumed from the session; listed so this scratch file stands alone
import pandas as pd
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

X_train
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
x_train_scaled = scaler.transform(X_train)
x_train_scaled
foo = scaler.fit(X_train)
x_train_scaled2 = foo.transform(X_train)
x_train_scaled2
(x_train_scaled == x_train_scaled2).all()
toy = pd.DataFrame({
'numeric': [1., 2., 3., 4., 5.],
'category': ['a', 'a', 'b', 'c', 'b']
})
numeric_features = ['numeric']
categorical_features = ['category']
preprocessor = ColumnTransformer(transformers=[('num', StandardScaler(), numeric_features),
('cat', OneHotEncoder(), categorical_features)
])
preprocessor.fit(toy)
bar = preprocessor.transform(toy)
bar
#############
toy2 = pd.DataFrame({
'numeric': [1., 2., 3., 4., 5.],
'numeric2': [1., 2., 3., 4., 6.],
'category': ['a', 'a', 'b', 'c', 'b'],
'category2': ['b', 'a', 'b', 'e', 'f']
})
numeric_features = ['numeric', 'numeric2']
categorical_features = ['category', 'category2']
preprocessor = ColumnTransformer(transformers=[
('num', StandardScaler(), numeric_features),
('cat', OneHotEncoder(), categorical_features)
])
preprocessor.fit(toy2)
bar2 = preprocessor.transform(toy2)
bar2
####
import numpy as np
from sklearn.decomposition import PCA
from pandas import DataFrame
pca = PCA(n_components = 2)
pca.fit(toy2.iloc[:, 0:2])
columns = ['pca_%i' % i for i in range(2)]
df_pca = DataFrame(pca.transform(toy2.iloc[:, 0:2])
, columns=columns
, index=toy2.index)
df_pca.head()

161
earlier_versions/skf_mm.py Normal file
View file

@ -0,0 +1,161 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 10:33:15 2022
@author: tanu
"""
#%% Stratified KFold: Multiple_models:
# imports assumed from the session; repeated here so the script stands alone
import pandas as pd
from statistics import mean
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (f1_score, matthews_corrcoef, precision_score,
                             recall_score, accuracy_score, roc_auc_score)
# Choose ONE input_df/var_type pair before running (the last assignment wins):
# input_df = numerical_features_df ; var_type = 'numerical'
# input_df = all_features_df ; var_type = 'mixed'
input_df = categorical_features_df
var_type = 'categorical'
targetF = target1
#==============================================================================
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix
# Determine preprocessing steps ~ var_type
if var_type == 'numerical':
t = [('num', MinMaxScaler(), numerical_ix)]
if var_type == 'categorical':
t = [('cat', OneHotEncoder(), categorical_ix)]
if var_type == 'mixed':
t = [('cat', OneHotEncoder(), categorical_ix)
, ('num', MinMaxScaler(), numerical_ix)]
###############################################################################
col_transform = ColumnTransformer(transformers = t
, remainder='passthrough')
###############################################################################
rs = {'random_state': 42}
#log_reg = LogisticRegression(**rs)
log_reg = LogisticRegression(class_weight = 'balanced')
nb = BernoulliNB()
rf = RandomForestClassifier(**rs)
clfs = [('Logistic Regression', log_reg)
,('Naive Bayes' , nb)
, ('Random Forest' , rf)
]
#seed_skf = 42
skf = StratifiedKFold(n_splits = 10
, shuffle = True
#, random_state = seed_skf
, **rs)
#scores_df = pd.DataFrame()
# NOTE: these lists are shared across folds AND classifiers, so the running
# means below blend every model seen so far (see the CHECK comments at the end)
fscoreL = []
mccL = []
presL = []
recallL = []
accuL = []
roc_aucL = []
# X_array = np.array(input_df)
# Y = np.array(target1)
# Y = target1
for train_index, test_index in skf.split(input_df, targetF):
print('\nSKF train index:', train_index
, '\nSKF test index:', test_index)
x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
y_train_fold, y_test_fold = targetF.iloc[train_index], targetF.iloc[test_index]
# for train_index, test_index in skf.split(X_array, Y):
# print('\nSKF train index:', train_index
# , '\nSKF test index:', test_index)
# x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
# y_train_fold, y_test_fold = Y[train_index], Y[test_index]
clf_scores_df = pd.DataFrame()
for clf_name, clf in clfs:
# model2 = Pipeline(steps=[('preprocess', MinMaxScaler())
# , ('classifier', clf)])
model2 = Pipeline(steps=[('preprocess', col_transform)
, ('classifier', clf)])
model2.fit(x_train_fold, y_train_fold)
y_pred_fold = model2.predict(x_test_fold)
#----------------
# Model metrics
#----------------
# F1-Score
fscore = f1_score(y_test_fold, y_pred_fold)
fscoreL.append(fscore)
# print('fscoreL Len: ', len(fscoreL))
fscoreM = mean(fscoreL)
# Matthews correlation coefficient
mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
mccL.append(mcc)
mccM = mean(mccL)
# Precision
pres = precision_score(y_test_fold, y_pred_fold)
presL.append(pres)
presM = mean(presL)
# Recall
recall = recall_score(y_test_fold, y_pred_fold)
recallL.append(recall)
recallM = mean(recallL)
# Accuracy
accu = accuracy_score(y_test_fold, y_pred_fold)
accuL.append(accu)
accuM = mean(accuL)
# ROC_AUC
roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
roc_aucL.append(roc_auc)
roc_aucM = mean(roc_aucL)
clf_scores_df = clf_scores_df.append({'Model': clf_name
,'F1_score' : fscoreM
, 'MCC' : mccM
, 'Precision': presM
, 'Recall' : recallM
, 'Accuracy' : accuM
, 'ROC_curve': roc_aucM}
, ignore_index = True)
#scores_df = scores_df.append(clf_scores_df)
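# Hedged fix sketch for the 'awfully close' numbers flagged below: because the
# metric lists above are shared across classifiers, each running mean blends
# every model seen so far. Keeping per-classifier lists separates them, e.g.:
# from collections import defaultdict
# per_clf = {clf_name: defaultdict(list) for clf_name, _ in clfs}
# # ...inside the classifier loop, append per model instead:
# # per_clf[clf_name]['f1'].append(f1_score(y_test_fold, y_pred_fold))
# # ...after all folds, average per model:
# # {name: {m: mean(v) for m, v in d.items()} for name, d in per_clf.items()}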
#%% Call functions
tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN)
tN_res
t2_res = MultClassPipeline2(X_train, X_test, y_train, y_test, input_df = all_features_df)
t2_res
#CHECK: numbers are awfully close to each other!
t3_res = MultClassPipeSKF(input_df = numerical_features_df
, y_targetF = target1
, var_type = 'numerical'
, skf_splits = 10)
t3_res
#CHECK: numbers are awfully close to each other!
t4_res = MultClassPipeSKF(input_df = all_features_df
, y_targetF = target1
, var_type = 'mixed'
, skf_splits = 10)
t4_res

View file

@ -0,0 +1,39 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 14 10:46:44 2022
@author: tanu
"""
# Link: https://laptrinhx.com/how-to-run-30-machine-learning-models-with-2-lines-of-code-1521663246/
import pyforest
import warnings
warnings.filterwarnings("ignore")
from sklearn import metrics
from sklearn.metrics import accuracy_score
import lazypredict
from lazypredict.Supervised import LazyClassifier
#%%
target = target1
#target = target3
X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df,
target,
test_size = 0.33,
random_state = 42)
#%%
clf = LazyClassifier(verbose=0,ignore_warnings=True)
modelsN, predictionsN = clf.fit(X_trainN, X_testN, y_trainN, y_testN)
mm_lpN = modelsN
#%%
# DOESN'T work: the mixed-type features need a pipeline (one-hot encoder) first
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
mm_lp = models
# NOTE: this also fails; Pipeline.fit() takes (X, y), not the 4-argument
# signature LazyClassifier uses, and LazyClassifier is not a valid final step
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
                           , ('multiModels', clf) ])
models, predictions = model1.fit(X_trainN, X_testN, y_trainN, y_testN)
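# Hedged workaround sketch: LazyClassifier has no Pipeline support, so apply
# the encoding up front and hand it plain arrays (ColumnTransformer /
# OneHotEncoder as used earlier; categorical_ix assumed from the session):
# ct = ColumnTransformer([('cat', OneHotEncoder(), categorical_ix)]
#                        , remainder = 'passthrough')
# X_train_enc = ct.fit_transform(X_train)
# X_test_enc = ct.transform(X_test)
# models, predictions = clf.fit(X_train_enc, X_test_enc, y_train, y_test)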