renamed hyperparams to gscv

This commit is contained in: parent a82358dbb4, commit ad5ebad7f8
31 changed files with 4433 additions and 0 deletions

361  earlier_versions/my_datap4.py  (new file)

@@ -0,0 +1,361 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 24 10:48:10 2022

@author: tanu
"""
###############################################################################
# questions:
# which data to use: merged_df3 or merged_df2?
# which is the target: or_mychisq or the drtype col?
# scaling: can it be from -1 to 1?
# how to include the mutation information?
#   'wild_type', 'mutant', 'postion'
# whether to log transform the af and or cols
# whether to allow mean/mode values to be imputed for the validation set
# whether to calculate mean/median accounting for NAs or after removing them?
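
# A minimal sketch toward the scaling/log/imputation questions above, kept
# commented out since the imports and data load only happen further down;
# the 'or_mychisq' column name comes from the question above, the rest of
# the names here are illustrative assumptions:
#   preprocessing.MinMaxScaler(feature_range = (-1, 1))  # yes, -1 to 1 works
#   my_df['or_log'] = np.log1p(my_df['or_mychisq'])      # log1p tolerates zeros
#   from sklearn.impute import SimpleImputer
#   SimpleImputer(strategy = 'mean')          # or 'most_frequent' for mode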

# strategy:
# available data = X_train
# available data but NAN = validation_test
# test data: mut generated, not in mcsm

###############################################################################
import os, sys
import re
# (dropped: load_boston import; it was unused here and has been removed from scikit-learn)
from sklearn import datasets
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
from statistics import mean, stdev

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
# plot_precision_recall_curve was removed in scikit-learn 1.2; use PrecisionRecallDisplay instead
from sklearn.metrics import PrecisionRecallDisplay
import itertools
#%% read data
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")

# this needs to be merged_df2 or merged_df3?
# gene: 'pncA'
drug = 'pyrazinamide'

my_df = pd.read_csv("pnca_merged_df3.csv")

my_df.dtypes
my_df_cols = my_df.columns

#%%
# GET Y
# Y = my_df.loc[:, drug]  # has NAs
dm_om_map = {'DM': 1, 'OM': 0}
my_df['resistance'] = my_df['mutation_info_labels'].map(dm_om_map)

# sanity check
my_df['resistance'].value_counts()
my_df['mutation_info_labels'].value_counts()

Y = my_df['resistance']

#%%
# GET X
cols = my_df.columns
X = my_df[['ligand_distance'
           , 'ligand_affinity_change'
           , 'duet_stability_change', 'ddg_foldx', 'deepddg', 'ddg_dynamut2', 'consurf_score'
           , 'snap2_score'
           #, 'snap2_accuracy_pc'
           , 'asa'
           , 'rsa']]

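# A hedged sketch for the "how to include the mutation information?"
# question in the header: one-hot encode the categorical mutation columns
# and append them to X. Kept commented out because the column names follow
# the header note and are assumptions about what merged_df3 contains.
#   mut_cols = ['wild_type', 'mutant', 'postion']
#   mut_ohe = pd.get_dummies(my_df[mut_cols].astype(str), columns = mut_cols)
#   X = pd.concat([X, mut_ohe], axis = 1)
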
#%%
####################################
# SIMPLEST case of train_test split
# Random forest
# one-hot encoder
# MinMaxScaler
# https://towardsdatascience.com/my-random-forest-classifier-cheat-sheet-in-python-fedb84f8cf4f
####################################
seed = 50
X_train, X_test, y_train, y_test = train_test_split(X, Y
                                                    , test_size = 0.333
                                                    , random_state = seed)

features_to_encode = list(X_train.select_dtypes(include = ['object']).columns)

col_trans = make_column_transformer(
    (OneHotEncoder(), features_to_encode),
    remainder = "passthrough"
)
MinMaxS = preprocessing.MinMaxScaler()
standardS = preprocessing.StandardScaler()

rf_classifier = RandomForestClassifier(
                      min_samples_leaf=50,
                      n_estimators=150,
                      bootstrap=True,
                      oob_score=True,
                      n_jobs=-1,
                      random_state=seed,
                      max_features='sqrt')  # 'auto' was removed in scikit-learn 1.3; 'sqrt' is its equivalent for classifiers

pipe = make_pipeline(col_trans
                     #, MinMaxS
                     #, standardS
                     , rf_classifier)

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
accuracy_score(y_test, y_pred)

print("\nModel evaluation:\n")
print(f"Accuracy: {round(accuracy_score(y_test, y_pred) * 100, 1)} %")
print(f"Recall: {round(recall_score(y_test, y_pred) * 100, 1)} %")
print(f"Precision: {round(precision_score(y_test, y_pred) * 100, 1)} %")
print(f"F1-score: {round(f1_score(y_test, y_pred) * 100, 1)} %")

recall_score(y_test, y_pred)
precision_score(y_test, y_pred)
f1_score(y_test, y_pred)
roc_auc_score(y_test, y_pred)  # AUC from hard labels; the probability-based version below is preferable
roc_curve(y_test, y_pred)      # likewise, roc_curve normally takes scores/probabilities

# plot_precision_recall_curve was removed in scikit-learn 1.2
disp = PrecisionRecallDisplay.from_estimator(pipe, X_test, y_test)

train_probs = pipe.predict_proba(X_train)[:, 1]
probs = pipe.predict_proba(X_test)[:, 1]
train_predictions = pipe.predict(X_train)

print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs)}')
print(f'Test ROC AUC Score: {roc_auc_score(y_test, probs)}')

def evaluate_model(y_pred, probs, train_predictions, train_probs):
    baseline = {}
    baseline['recall'] = recall_score(y_test,
                                      [1 for _ in range(len(y_test))])
    baseline['precision'] = precision_score(y_test,
                                            [1 for _ in range(len(y_test))])
    baseline['roc'] = 0.5
    results = {}
    results['recall'] = recall_score(y_test, y_pred)
    results['precision'] = precision_score(y_test, y_pred)
    results['roc'] = roc_auc_score(y_test, probs)
    train_results = {}
    train_results['recall'] = recall_score(y_train,
                                           train_predictions)
    train_results['precision'] = precision_score(y_train, train_predictions)
    train_results['roc'] = roc_auc_score(y_train, train_probs)
    # report each metric (matches the example output noted below)
    for metric in ['recall', 'precision', 'roc']:
        print(f"{metric.capitalize()} Baseline: {round(baseline[metric], 2)} Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}")

    # Calculate false positive rates and true positive rates
    base_fpr, base_tpr, _ = roc_curve(y_test, [1 for _ in range(len(y_test))])
    model_fpr, model_tpr, _ = roc_curve(y_test, probs)
    plt.figure(figsize = (8, 6))
    plt.rcParams['font.size'] = 16
    # Plot both curves
    plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
    plt.plot(model_fpr, model_tpr, 'r', label = 'model')
    plt.legend(); plt.xlabel('False Positive Rate');
    plt.ylabel('True Positive Rate'); plt.title('ROC Curves');
    plt.show()

# Recall Baseline: 1.0 Test: 0.92 Train: 0.93
# Precision Baseline: 0.48 Test: 0.9 Train: 0.91
# Roc Baseline: 0.5 Test: 0.97 Train: 0.97

evaluate_model(y_pred, probs, train_predictions, train_probs)

def plot_confusion_matrix(cm, classes, normalize = False,
                          title = 'Confusion matrix',
                          cmap = plt.cm.Greens):  # can change color
    plt.figure(figsize = (10, 10))
    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title, size = 24)
    plt.colorbar(aspect = 4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 45, size = 14)
    plt.yticks(tick_marks, classes, size = 14)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    # Label the plot
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 fontsize = 20,
                 horizontalalignment = "center",
                 color = "white" if cm[i, j] > thresh else "black")
    plt.grid(None)
    plt.tight_layout()
    plt.ylabel('True label', size = 18)
    plt.xlabel('Predicted label', size = 18)

# Let's plot it out
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes = ['0 - Susceptible', '1 - Resistant'],
                      title = 'R/S Confusion Matrix')

print(rf_classifier.feature_importances_)
print(f"There are {len(rf_classifier.feature_importances_)} features in total")
#%%
####################################
# Model 2: case of stratified K-fold
# Logistic regression
# MinMaxScaler
# https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2 [didn't work!]
# https://www.geeksforgeeks.org/stratified-k-fold-cross-validation/
####################################
print('Class Ratio:',
      sum(Y) / len(Y))

print('Class Ratio:',
      sum(my_df['resistance']) / len(my_df['resistance']))

seed_skf = 50
skf = StratifiedKFold(n_splits = 10
                      , shuffle = True
                      , random_state = seed_skf)

lst_accu_stratified = []
scaler = preprocessing.MinMaxScaler()
# NOTE: scaling the full X before splitting leaks test information into
# training; the pipeline version in Model 2.1 below avoids this
X_scaled = scaler.fit_transform(X)
#X_scaled = X_scaled[:, [1, 2, 3]]

#lr = linear_model.LogisticRegression(class_weight = 'balanced')
lr = linear_model.LogisticRegression()

for train_index, test_index in skf.split(X, Y):
    #print(train_index)
    #print(test_index)
    x_train_fold, x_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = Y.iloc[train_index], Y.iloc[test_index]
    lr.fit(x_train_fold, y_train_fold)
    lst_accu_stratified.append(lr.score(x_test_fold, y_test_fold))

# print output
print('List of per-fold accuracies:', lst_accu_stratified)
print('Max accuracy:', max(lst_accu_stratified) * 100, "%")
print('Min accuracy:', min(lst_accu_stratified) * 100, "%")
print('Mean accuracy:', mean(lst_accu_stratified) * 100, "%")
print('St Dev:', stdev(lst_accu_stratified) * 100, "%")

#%%
#--------------------------------------
# Model 2.1: same model but with a pipeline
# slightly different results when using
# transformed vs untransformed values!
#--------------------------------------
model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
                                 , ('logis', LogisticRegression(class_weight = 'balanced'))])  # 'balanced' is the valid option ('unbalanced' raises an error); changes stdev
seed_skf = 50
skf = StratifiedKFold(n_splits = 10
                      , shuffle = True
                      , random_state = seed_skf)

X_array = np.array(X)
lst_accu_stratified = []
for train_index, test_index in skf.split(X_array, Y):
    x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
    y_train_fold, y_test_fold = Y.iloc[train_index], Y.iloc[test_index]
    model_logisP.fit(x_train_fold, y_train_fold)
    lst_accu_stratified.append(model_logisP.score(x_test_fold, y_test_fold))

# print output
print('List of per-fold accuracies:', lst_accu_stratified)
print('Max accuracy:', max(lst_accu_stratified) * 100, "%")
print('Min accuracy:', min(lst_accu_stratified) * 100, "%")
print('Mean accuracy:', mean(lst_accu_stratified) * 100, "%")
print('St Dev:', stdev(lst_accu_stratified) * 100, "%")

####################################
# Model 3: stratified K-fold
# Random forest
# MinMaxScaler
# X: needs to be an array for stratified K-fold
####################################

model_rf = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
                             , ('rf', RandomForestClassifier(n_estimators = 100, random_state = 42))])
seed_skf = 50
skf = StratifiedKFold(n_splits = 10
                      , shuffle = True
                      , random_state = seed_skf)

X_array = np.array(X)
lst_accu_stratified_rf = []
for train_index, test_index in skf.split(X_array, Y):
    x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
    y_train_fold, y_test_fold = Y.iloc[train_index], Y.iloc[test_index]
    model_rf.fit(x_train_fold, y_train_fold)
    lst_accu_stratified_rf.append(model_rf.score(x_test_fold, y_test_fold))

# print output
print('List of per-fold accuracies:', lst_accu_stratified_rf)
print('Max accuracy:', max(lst_accu_stratified_rf) * 100, "%")
print('Min accuracy:', min(lst_accu_stratified_rf) * 100, "%")
print('Mean accuracy:', mean(lst_accu_stratified_rf) * 100, "%")
print('St Dev:', stdev(lst_accu_stratified_rf) * 100, "%")
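
####################################
# GridSearchCV is imported above but never used (this commit renamed
# hyperparams to gscv); a minimal sketch of tuning the 'rf' step of
# model_rf with it. The parameter grid values are illustrative only.
####################################
param_grid = {'rf__n_estimators': [100, 300]
              , 'rf__min_samples_leaf': [1, 10, 50]}
gscv = GridSearchCV(model_rf, param_grid
                    , cv = skf
                    , scoring = 'accuracy'
                    , n_jobs = -1)
gscv.fit(X_array, Y)
print('Best params:', gscv.best_params_)
print('Best mean CV accuracy:', gscv.best_score_)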

####################################
# Model 4: cross_validate with K-fold
# Random forest
# MinMaxScaler
# X: needs to be an array for K-fold
# FIXME: MSE is a regression metric; for a classifier use a classification
# scorer (accuracy, F1, ROC AUC, ...) as sketched below
####################################
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate

score_fn = make_scorer(mean_squared_error)
scores = cross_validate(model_rf, X_train, y_train
                        , scoring = score_fn
                        , cv = 10)

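# A sketch of classification-appropriate scoring for the FIXME above:
# cross_validate accepts built-in scorer names directly, so make_scorer
# is unnecessary for standard metrics (the metric choice is illustrative).
scores_clf = cross_validate(model_rf, X_train, y_train
                            , scoring = ['accuracy', 'f1', 'roc_auc']
                            , cv = 10)
print('Mean CV accuracy:', scores_clf['test_accuracy'].mean())
print('Mean CV F1:', scores_clf['test_f1'].mean())
print('Mean CV ROC AUC:', scores_clf['test_roc_auc'].mean())
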
from itertools import combinations
def train(X):
    return cross_validate(model_rf, X, y_train
                          , scoring = score_fn
                          , cv = 10
                          , return_estimator = True)['test_score']
# NOTE: X_train has only 10 columns, so the original subset size of 11
# yielded no combinations at all; size n-1 scores every leave-one-out subset
scores = [train(X_train.loc[:, vars]) for vars in combinations(X_train.columns, len(X_train.columns) - 1)]
means = [score.mean() for score in scores]
#%%
# https://stackoverflow.com/questions/52316237/finding-logistic-regression-weights-from-k-fold-cv
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import KFold
kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
logistic = LogisticRegressionCV(Cs = 2, fit_intercept = True, cv = kf, verbose = 1, random_state = 42)
logistic.fit(X_train, y_train)
print("Train Coefficient:", logistic.coef_)      # weights of each feature
print("Train Intercept:", logistic.intercept_)   # value of the intercept
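
# LogisticRegressionCV refits on all the data after picking C, so coef_
# above is a single weight vector; to inspect per-fold weights (what the
# linked question asks about), a sketch using cross_validate:
cv_out = cross_validate(LogisticRegression(max_iter = 1000), X_train, y_train
                        , cv = kf
                        , return_estimator = True)
for i, est in enumerate(cv_out['estimator']):
    print(f'Fold {i} coefficients:', est.coef_)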
#%%
# https://machinelearningmastery.com/repeated-k-fold-cross-validation-with-python/
from sklearn.model_selection import cross_val_score
from numpy import std
cv = KFold(n_splits = 10, random_state = 1, shuffle = True)
scores = cross_val_score(model_rf, X, Y, scoring = 'accuracy', cv = cv, n_jobs = -1)
scores2 = cross_val_score(model_logisP, X, Y, scoring = 'accuracy', cv = cv, n_jobs = -1)
# report performance (numpy's std used for both lines, for consistency)
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
print('Accuracy: %.3f (%.3f)' % (mean(scores2), std(scores2)))
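
# The linked article covers *repeated* k-fold; a sketch of the stratified
# repeated variant, which suits this imbalanced binary target:
from sklearn.model_selection import RepeatedStratifiedKFold
rcv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
scores_rep = cross_val_score(model_rf, X, Y, scoring = 'accuracy', cv = rcv, n_jobs = -1)
print('Repeated CV accuracy: %.3f (%.3f)' % (scores_rep.mean(), scores_rep.std()))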