added my_data4 after outputting merged_df3 for pnca to test the ml models

Tanushree Tunstall 2022-03-03 13:35:05 +00:00
parent 25a55ac914
commit 04e0267dd1
11 changed files with 5918 additions and 377 deletions

293
my_datap4.py Normal file

@@ -0,0 +1,293 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 24 10:48:10 2022
@author: tanu
"""
###############################################################################
# questions:
# which data to use: merged_df3 or merged_df2
# which is the target? or_mychisq or drtype col
# scaling: can it be from -1 to 1?
# how to include the mutation information?
# 'wild_type', 'mutant', 'position'
# whether to log transform the af and or cols
# to allow mean mode values to be imputed for validation set
# whether to calculate mean, median accounting for NA or removing them?
# strategy:
# available data = X_train
# available data but NAN = validation_test
# test data: mut generated not in mcsm
###############################################################################
import os, sys
import re
# from sklearn.datasets import load_boston # unused here; load_boston was removed in scikit-learn 1.2
from sklearn import datasets
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
from statistics import mean, stdev
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
from sklearn.metrics import plot_precision_recall_curve # deprecated in scikit-learn 1.0; PrecisionRecallDisplay.from_estimator is the newer equivalent
import itertools
#%% read data
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
# this needs to be merged_df2 or merged_df3?
#gene 'pncA'
drug = 'pyrazinamide'
my_df = pd.read_csv("pnca_merged_df3.csv")
my_df.dtypes
my_df_cols = my_df.columns
#%%
# GET Y
# Y = my_df.loc[:,drug] #has NA
dm_om_map = {'DM': 1, 'OM': 0}
my_df['resistance'] = my_df['mutation_info_labels'].map(dm_om_map)
# sanity check
my_df['resistance'].value_counts()
my_df['mutation_info_labels'].value_counts()
Y = my_df['resistance']
#%%
# GET X
cols = my_df.columns
X = my_df[['ligand_distance', 'ligand_affinity_change', 'duet_stability_change', 'ddg_foldx', 'deepddg', 'ddg_dynamut2', 'consurf_score']]
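# A minimal sketch (not part of the original pipeline) of the strategy in the
# header comments: rows with a known target form the available training pool,
# rows whose target is NaN become a separate validation set, and remaining
# numeric NaNs are imputed with the training means. The names 'validation_test',
# 'imputer', 'X_avail' etc. are illustrative only and not used further below.
from sklearn.impute import SimpleImputer
has_target = Y.notna()
X_avail, y_avail = X[has_target], Y[has_target]
validation_test = X[~has_target]
imputer = SimpleImputer(strategy = 'mean')
X_avail_imp = pd.DataFrame(imputer.fit_transform(X_avail)
                           , columns = X.columns
                           , index = X_avail.index)
if not validation_test.empty:
    validation_test_imp = pd.DataFrame(imputer.transform(validation_test)
                                       , columns = X.columns
                                       , index = validation_test.index)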
#%%
####################################
# SIMPLEST case of train_test split
# Random forest
# one hot encoder
# MinMaxScaler
# https://towardsdatascience.com/my-random-forest-classifier-cheat-sheet-in-python-fedb84f8cf4f
####################################
seed = 50
X_train, X_test, y_train, y_test = train_test_split(X,Y
, test_size = 0.333
, random_state = seed)
features_to_encode = list(X_train.select_dtypes(include = ['object']).columns)
col_trans = make_column_transformer(
(OneHotEncoder(),features_to_encode),
remainder = "passthrough"
)
MinMaxS = preprocessing.MinMaxScaler()
standardS = preprocessing.StandardScaler()
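# The header asks whether scaling can run from -1 to 1: MinMaxScaler takes a
# feature_range argument, so a (-1, 1) variant like the one below could be
# swapped into the pipeline in place of MinMaxS (a sketch; the scalers are
# currently commented out of the pipeline anyway).
MinMaxS_neg1_1 = preprocessing.MinMaxScaler(feature_range = (-1, 1))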
rf_classifier = RandomForestClassifier(
min_samples_leaf=50,
n_estimators=150,
bootstrap=True,
oob_score=True,
n_jobs=-1,
random_state=seed,
max_features='auto') # 'auto' is equivalent to 'sqrt' for classifiers; deprecated in scikit-learn >= 1.1
pipe = make_pipeline(col_trans
#, MinMaxS
#, standardS
, rf_classifier)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
accuracy_score(y_test, y_pred)
print("\nModel evaluation:\n")
print(f"Accuracy: {round(accuracy_score(y_test,y_pred),3)*100} %")
print(f"Recall: {round(recall_score(y_test,y_pred),3)*100} %")
print(f"Precision: {round(precision_score(y_test,y_pred),3)*100} %")
print(f"F1-score: {round(f1_score(y_test,y_pred),3)*100} %")
recall_score(y_test, y_pred)
precision_score(y_test, y_pred)
f1_score(y_test, y_pred)
roc_auc_score(y_test, y_pred) # note: computed from hard labels; the probability-based AUC below (predict_proba) is the more informative one
roc_curve(y_test, y_pred) # same caveat: roc_curve is normally given predicted probabilities or scores
disp = plot_precision_recall_curve(pipe, X_test, y_test) # removed in scikit-learn 1.2; PrecisionRecallDisplay.from_estimator(pipe, X_test, y_test) is the replacement
train_probs = pipe.predict_proba(X_train)[:,1]
probs = pipe.predict_proba(X_test)[:, 1]
train_predictions = pipe.predict(X_train)
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs)}')
print(f'Test ROC AUC Score: {roc_auc_score(y_test, probs)}')
def evaluate_model(y_pred, probs, train_predictions, train_probs):
    # Compares the model against an "always predict resistant" baseline and
    # plots both ROC curves; y_train and y_test come from the enclosing scope.
    baseline = {}
    baseline['recall'] = recall_score(y_test,
                                      [1 for _ in range(len(y_test))])
    baseline['precision'] = precision_score(y_test,
                                            [1 for _ in range(len(y_test))])
    baseline['roc'] = 0.5
    results = {}
    results['recall'] = recall_score(y_test, y_pred)
    results['precision'] = precision_score(y_test, y_pred)
    results['roc'] = roc_auc_score(y_test, probs)
    train_results = {}
    train_results['recall'] = recall_score(y_train, train_predictions)
    train_results['precision'] = precision_score(y_train, train_predictions)
    train_results['roc'] = roc_auc_score(y_train, train_probs)
    # for metric in ['recall', 'precision', 'roc']:
    #     print(f"Baseline: {round(baseline[metric], 2)} Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}")
    # Calculate false positive rates and true positive rates
    base_fpr, base_tpr, _ = roc_curve(y_test, [1 for _ in range(len(y_test))])
    model_fpr, model_tpr, _ = roc_curve(y_test, probs)
    plt.figure(figsize = (8, 6))
    plt.rcParams['font.size'] = 16
    # Plot both curves
    plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
    plt.plot(model_fpr, model_tpr, 'r', label = 'model')
    plt.legend(); plt.xlabel('False Positive Rate');
    plt.ylabel('True Positive Rate'); plt.title('ROC Curves');
    plt.show()
# Recall Baseline: 1.0 Test: 0.92 Train: 0.93
# Precision Baseline: 0.48 Test: 0.9 Train: 0.91
# Roc Baseline: 0.5 Test: 0.97 Train: 0.97
evaluate_model(y_pred, probs, train_predictions, train_probs)
def plot_confusion_matrix(cm, classes, normalize = False,
                          title = 'Confusion matrix',
                          cmap = plt.cm.Greens): # can change colour
    plt.figure(figsize = (10, 10))
    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title, size = 24)
    plt.colorbar(aspect = 4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 45, size = 14)
    plt.yticks(tick_marks, classes, size = 14)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    # Label the plot
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 fontsize = 20,
                 horizontalalignment = "center",
                 color = "white" if cm[i, j] > thresh else "black")
    plt.grid(None)
    plt.tight_layout()
    plt.ylabel('True label', size = 18)
    plt.xlabel('Predicted label', size = 18)
# Let's plot it out
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes = ['0 - Susceptible', '1 - Resistant'],
                      title = 'R/S Confusion Matrix')
print(rf_classifier.feature_importances_)
print(f" There are {len(rf_classifier.feature_importances_)} features in total")
#%%
####################################
# Model 2: case of stratified K-fold
# Random forest
# MinMaxScaler
# https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2
####################################
print('Class Ratio:',
sum(Y)/len(Y))
print('Class Ratio:',
sum(my_df['resistance'])/len(my_df['resistance'])
)
seed_skf = 50
skf = StratifiedKFold(n_splits = 10
, shuffle = True
, random_state = seed_skf)
target = my_df.loc[:,'resistance']
df = my_df[['ligand_distance'
, 'ligand_affinity_change'
, 'duet_stability_change'
, 'ddg_foldx'
, 'deepddg'
, 'ddg_dynamut2'
, 'consurf_score'
, 'resistance']]
# To start with, we'll just split our data and print the class ratio for
# each fold to check that they are all close to the full data set.
# The test set contains a single fold, so we use the test split to determine
# each fold's class ratio. Each fold's class ratio should be close
# to that of the full data set, which is what we want.
fold_no = 1 # to label the folds for printing output
for train_index, test_index in skf.split(df, target):
    train = df.loc[train_index, :]
    test = df.loc[test_index, :]
    print('Fold', str(fold_no)
          , 'Class Ratio:'
          , sum(test['resistance'])/len(test['resistance']))
    fold_no += 1
model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
, ('logis', LogisticRegression(class_weight = 'balanced'))
])
model = LogisticRegression()
# Next, we'll build a custom function that we can pass our data splits to for
# training and testing.
def train_model(train, test, fold_no):
    # select the feature columns and the target column from this fold
    feature_cols = ['ligand_distance'
                    , 'ligand_affinity_change'
                    , 'duet_stability_change'
                    , 'ddg_foldx'
                    , 'deepddg'
                    , 'ddg_dynamut2'
                    , 'consurf_score']
    X_train = train[feature_cols]
    y_train = train['resistance']
    X_test = test[feature_cols]
    y_test = test['resistance']
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print('Fold', str(fold_no),
          'Accuracy:',
          accuracy_score(y_test, predictions))
# Finally, let's modify the for loop we created above to call the train_model
# function on each of our splits.
fold_no = 1
for train_index, test_index in skf.split(df, target):
    train = df.loc[train_index, :]
    test = df.loc[test_index, :]
    train_model(train, test, fold_no)
    fold_no += 1
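# A sketch showing how the manual fold loop above could be collapsed into a
# single cross_validate() call (already imported) using the otherwise unused
# model_logisP pipeline, which additionally Min-Max scales the features before
# the logistic regression. 'X_all' and 'y_all' are illustrative names.
X_all = df.drop('resistance', axis = 1)
y_all = df['resistance']
cv_results = cross_validate(model_logisP
                            , X_all
                            , y_all
                            , cv = skf
                            , scoring = ['accuracy', 'recall', 'precision'])
print('Mean CV accuracy:', round(cv_results['test_accuracy'].mean(), 3))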