#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 24 10:48:10 2022
@author: tanu
"""
###############################################################################
# questions:
# which data to use: merged_df3 or merged_df2?
# which is the target: or_mychisq or the drtype col?
# scaling: can it be from -1 to 1?
# how to include the mutation information?
# 'wild_type', 'mutant', 'position'
# whether to log transform the af and or cols
# to allow mean/mode values to be imputed for the validation set
# whether to calculate mean/median accounting for NAs or removing them?
# strategy:
# available data = X_train
# available data but NAN = validation_test
# test data: mut generated not in mcsm
###############################################################################
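# a minimal sketch of the split strategy above (illustrative; 'resistance' is
# the target column created further down, and the masks assume missing labels
# mark the validation rows):
# labelled = my_df['resistance'].notna()
# X_train_pool = my_df[labelled]      # available data
# validation_test = my_df[~labelled]  # available data but NAN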
import os, sys
import re
import itertools
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
from statistics import mean, stdev
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import (train_test_split, GridSearchCV,
                                     StratifiedKFold, cross_validate)
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score,
                             recall_score, roc_auc_score, roc_curve, f1_score,
                             make_scorer, PrecisionRecallDisplay)
# note: the unused load_boston/load_digits imports and the deprecated
# plot_precision_recall_curve (both removed in scikit-learn 1.2) are dropped;
# PrecisionRecallDisplay is the replacement used below
#%% read data
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
# should this be merged_df2 or merged_df3?
# gene: 'pncA'
drug = 'pyrazinamide'
my_df = pd.read_csv("pnca_merged_df3.csv")
my_df.dtypes
my_df_cols = my_df.columns
#%%
# GET Y
# Y = my_df.loc[:,drug] #has NA
# binary target: DM -> 1, OM -> 0 (from the mutation_info_labels column)
dm_om_map = {'DM': 1, 'OM': 0}
my_df['resistance'] = my_df['mutation_info_labels'].map(dm_om_map)
# sanity check
my_df['resistance'].value_counts()
my_df['mutation_info_labels'].value_counts()
Y = my_df['resistance']
#%%
# GET X
cols = my_df.columns
X = my_df[['ligand_distance', 'ligand_affinity_change', 'duet_stability_change', 'ddg_foldx', 'deepddg', 'ddg_dynamut2', 'consurf_score']]
#%%
####################################
# SIMPLEST case of train_test split
# Random forest
# one hot encoder
# MinMaxScaler
# https://towardsdatascience.com/my-random-forest-classifier-cheat-sheet-in-python-fedb84f8cf4f
####################################
seed = 50
X_train, X_test, y_train, y_test = train_test_split(X,Y
, test_size = 0.333
, random_state = seed)
features_to_encode = list(X_train.select_dtypes(include = ['object']).columns)
col_trans = make_column_transformer(
(OneHotEncoder(),features_to_encode),
remainder = "passthrough"
)
MinMaxS = preprocessing.MinMaxScaler()
standardS = preprocessing.StandardScaler()
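# re the scaling question at the top: MinMaxScaler defaults to [0, 1], but the
# range is configurable, so scaling to [-1, 1] is possible (illustrative; this
# symmetric scaler is not used in the pipeline below):
MinMaxS_sym = preprocessing.MinMaxScaler(feature_range = (-1, 1))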
rf_classifier = RandomForestClassifier(
    min_samples_leaf=50,
    n_estimators=150,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=seed,
    max_features='sqrt')  # 'auto' was removed in sklearn 1.3; 'sqrt' is the classifier equivalent
pipe = make_pipeline(col_trans
#, MinMaxS
#, standardS
, rf_classifier)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
accuracy_score(y_test, y_pred)
print("\nModel evaluation:\n")
print(f"Accuracy: {round(accuracy_score(y_test,y_pred),3)*100} %")
print(f"Recall: {round(recall_score(y_test,y_pred),3)*100} %")
print(f"Precision: {round(precision_score(y_test,y_pred),3)*100} %")
print(f"F1-score: {round(f1_score(y_test,y_pred),3)*100} %")
recall_score(y_test, y_pred)
precision_score(y_test, y_pred)
f1_score(y_test, y_pred)
# note: roc_auc_score/roc_curve are better fed probability scores than hard
# 0/1 predictions (see the predict_proba calls below)
roc_auc_score(y_test, y_pred)
roc_curve(y_test, y_pred)
# plot_precision_recall_curve was removed in sklearn 1.2; use the Display API
disp = PrecisionRecallDisplay.from_estimator(pipe, X_test, y_test)
train_probs = pipe.predict_proba(X_train)[:,1]
probs = pipe.predict_proba(X_test)[:, 1]
train_predictions = pipe.predict(X_train)
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs)}')
print(f'Test ROC AUC Score: {roc_auc_score(y_test, probs)}')
def evaluate_model(y_pred, probs, train_predictions, train_probs):
    # note: relies on y_train/y_test from the enclosing scope
    # baseline: a classifier that always predicts the positive class
    baseline = {}
    baseline['recall'] = recall_score(y_test, [1 for _ in range(len(y_test))])
    baseline['precision'] = precision_score(y_test, [1 for _ in range(len(y_test))])
    baseline['roc'] = 0.5
    results = {}
    results['recall'] = recall_score(y_test, y_pred)
    results['precision'] = precision_score(y_test, y_pred)
    results['roc'] = roc_auc_score(y_test, probs)
    train_results = {}
    train_results['recall'] = recall_score(y_train, train_predictions)
    train_results['precision'] = precision_score(y_train, train_predictions)
    train_results['roc'] = roc_auc_score(y_train, train_probs)
    # for metric in ['recall', 'precision', 'roc']:
    #     print(f"Baseline: {round(baseline[metric], 2)} Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}")
    # calculate false positive rates and true positive rates
    base_fpr, base_tpr, _ = roc_curve(y_test, [1 for _ in range(len(y_test))])
    model_fpr, model_tpr, _ = roc_curve(y_test, probs)
    plt.figure(figsize = (8, 6))
    plt.rcParams['font.size'] = 16
    # plot both curves
    plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
    plt.plot(model_fpr, model_tpr, 'r', label = 'model')
    plt.legend(); plt.xlabel('False Positive Rate');
    plt.ylabel('True Positive Rate'); plt.title('ROC Curves');
    plt.show()
# sample output:
# Recall    Baseline: 1.0  Test: 0.92 Train: 0.93
# Precision Baseline: 0.48 Test: 0.9  Train: 0.91
# Roc       Baseline: 0.5  Test: 0.97 Train: 0.97
evaluate_model(y_pred, probs, train_predictions, train_probs)
def plot_confusion_matrix(cm, classes, normalize = False,
                          title = 'Confusion matrix',
                          cmap = plt.cm.Greens): # can change color
    plt.figure(figsize = (10, 10))
    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title, size = 24)
    plt.colorbar(aspect = 4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 45, size = 14)
    plt.yticks(tick_marks, classes, size = 14)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    # label each cell with its count
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 fontsize = 20,
                 horizontalalignment = "center",
                 color = "white" if cm[i, j] > thresh else "black")
    plt.grid(None)
    plt.tight_layout()
    plt.ylabel('True label', size = 18)
    plt.xlabel('Predicted label', size = 18)
# Let's plot it out
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes = ['0 - Susceptible', '1 - Resistant'],
title = 'R/S Confusion Matrix')
# rf_classifier is the same object fitted inside the pipeline, so its
# feature_importances_ are available after pipe.fit()
print(rf_classifier.feature_importances_)
print(f"There are {len(rf_classifier.feature_importances_)} features in total")
#%%
####################################
# Model 2: case of stratified K-fold
# Random forest
# MinMaxScaler
# https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2
####################################
# overall class ratio (Y is my_df['resistance'])
print('Class Ratio:', sum(Y)/len(Y))
seed_skf = 50
skf = StratifiedKFold(n_splits = 10
, shuffle = True
, random_state = seed_skf)
target = my_df.loc[:,'resistance']
df = my_df[['ligand_distance'
, 'ligand_affinity_change'
, 'duet_stability_change'
, 'ddg_foldx'
, 'deepddg'
, 'ddg_dynamut2'
, 'consurf_score'
, 'resistance']]
# To start with, we'll just split our data and print the class ratio for
# each fold to check that they are all close to the full data set.
# The test set contains a single fold, so we use the test split to determine
# the class ratio for each fold. Each fold's class ratio should be close to
# that of the full data set, which is what we want.
fold_no = 1 # to label the folds for printing output
for train_index, test_index in skf.split(df, target):
    train = df.loc[train_index, :]
    test = df.loc[test_index, :]
    print('Fold', str(fold_no)
          , 'Class Ratio:'
          , sum(test['resistance'])/len(test['resistance']))
    fold_no += 1
# a scaled logistic-regression pipeline (defined here and exercised via
# cross_validate in the sketch below, not in the manual loop)
model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
                                 , ('logis', LogisticRegression(class_weight = 'balanced'))
                                 ])
model = LogisticRegression()
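# a minimal sketch (not part of the original flow) exercising model_logisP via
# the imported cross_validate, reusing the same stratified splitter; the
# scoring names are standard sklearn scorer strings:
logis_cv = cross_validate(model_logisP
                          , df.drop('resistance', axis = 1)
                          , target
                          , cv = skf
                          , scoring = ['accuracy', 'recall', 'precision'])
print('Logistic pipeline mean CV accuracy:',
      round(np.mean(logis_cv['test_accuracy']), 3))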
# Next we'll build a custom function that we can pass our data splits to for
# training and testing.
def train_model(train, test, fold_no):
    # select feature columns and the target from each fold's split;
    # the original indexed train/test with whole DataFrames, which fails
    x_cols = ['ligand_distance'
              , 'ligand_affinity_change'
              , 'duet_stability_change'
              , 'ddg_foldx'
              , 'deepddg'
              , 'ddg_dynamut2'
              , 'consurf_score']
    X_train = train[x_cols]
    y_train = train['resistance']
    X_test = test[x_cols]
    y_test = test['resistance']
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print('Fold', str(fold_no),
          'Accuracy:',
          accuracy_score(y_test, predictions))
# Finally, let's modify the for loop we created above to call the train_model
# function on each of our splits.
fold_no = 1
for train_index, test_index in skf.split(df, target):
    train = df.loc[train_index, :]
    test = df.loc[test_index, :]
    train_model(train, test, fold_no)
    fold_no += 1
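# a minimal sketch (illustrative) aggregating per-fold accuracy with the
# imported-but-unused mean/stdev helpers from the statistics module:
x_cols = ['ligand_distance', 'ligand_affinity_change', 'duet_stability_change',
          'ddg_foldx', 'deepddg', 'ddg_dynamut2', 'consurf_score']
fold_accs = []
for train_index, test_index in skf.split(df, target):
    train, test = df.loc[train_index, :], df.loc[test_index, :]
    model.fit(train[x_cols], train['resistance'])
    fold_accs.append(accuracy_score(test['resistance']
                                    , model.predict(test[x_cols])))
print('CV accuracy: mean', round(mean(fold_accs), 3)
      , 'stdev', round(stdev(fold_accs), 3))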