#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 24 10:48:10 2022
@author: tanu
"""
###############################################################################
# questions:
# which data to use: merged_df3 or merged_df2
# which is the target? or_mychisq or drtype col
# scaling: can it be from -1 to 1?
# how to include the mutation information?
# 'wild_type', 'mutant', 'position'
# whether to log transform the af and or cols
# whether to allow mean/mode values to be imputed for the validation set
# whether to calculate mean, median accounting for NA or removing them?
# strategy:
# available data = X_train
# available data but NAN = validation_test
# test data: mut generated not in mcsm
###############################################################################
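# A minimal sketch of the strategy above (uses the 'or_mychisq' target column
# from this file; my_df is only read further below, so this stays commented):
# training_pool  = my_df[my_df['or_mychisq'].notna()]  # available data
# validation_set = my_df[my_df['or_mychisq'].isna()]   # available but NaN target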
import os, sys
import re
from sklearn import datasets
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
from statistics import mean, stdev
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
#%% read data
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")
# this needs to be merged_df2 or merged_df3?
my_df = pd.read_csv("pnca_all_params.csv")
my_df.dtypes
my_df_cols = my_df.columns
omit_cols1 = ['pdb_file'
              , 'seq_offset4pdb'
              , 'mut_3upper'
              , 'wild_pos'
              , 'wild_chain_pos'
              , 'chain'
              , 'wt_3upper'
              , 'consurf_colour'
              , 'consurf_colour_rev'
              , 'consurf_msa_data'
              , 'consurf_aa_variety'
              , 'snap2_accuracy_pc'
              , 'beta_logistic'
              , 'se_logistic'
              , 'zval_logisitc'
              , 'pval_chisq'
              , 'log10_or_mychisq'
              , 'neglog_pval_fisher'
              , 'or_fisher'
              , 'wild_type'
              , 'mutant_type'
              , 'position'
              , 'ligand_id'
              , 'mutation'
              , 'ss'
              , 'ss_class' # include it later?
              , 'contacts'
              ]
# str.contains() does a regex search, so explicit .* wildcards are unnecessary
omit_cols2 = list(my_df.columns[my_df.columns.str.contains("ci_|_scaled|_outcome")])
# [WATCH:] just to test since these have negative values!
omit_cols3 = list(my_df.columns[my_df.columns.str.contains("electro_|disulfide_|hbonds_|partcov_|vdwclashes|volumetric")])
omit_cols = omit_cols1 + omit_cols2 + omit_cols3
# Filter df: keep only the selected columns (i.e. drop everything in omit_cols)
my_df_filt = my_df.loc[:, ~my_df.columns.isin(omit_cols)]
my_df_filt_cols = my_df_filt.columns
# Fill NAs of filtered df: fill NaNs with column means/medians
# (numeric_only=True, since newer pandas versions raise on non-numeric columns here)
my_df_filt2 = my_df_filt.fillna(my_df_filt.mean(numeric_only=True))
my_df_filt3 = my_df_filt.fillna(my_df_filt.median(numeric_only=True))
#my_df_filt_noNA = my_df_filt.fillna(0)
summ = my_df_filt.describe()
summ2 = my_df_filt2.describe()
summ3 = my_df_filt3.describe()
#summ_noNA = my_df_filt_noNA.describe()
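# Quick check (an addition, not in the original analysis): how many values
# does the imputation actually fill in per column?
print(my_df_filt.isna().sum().sort_values(ascending=False).head(10))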
########################
# [WATCH]: Drop na
# Get Y
my_df2 = my_df_filt.dropna().copy() # .copy() avoids SettingWithCopyWarning on the assignment below
my_df2['resistance'] = my_df2['or_mychisq'].apply(lambda x: 0 if x <= 1 else 1)
my_df2['resistance'].value_counts()
Y = np.array(my_df2['resistance'])
#Y = Y.reset_index()
#Y = Y.drop(['index'], axis = 1)
#Y.value_counts()
#Y = np.array(Y)
# GET X
omit_cols_y = ['or_mychisq', 'resistance']
my_df_ml = my_df2.loc[:, ~my_df2.columns.isin(omit_cols_y)]
#my_df_ml = my_df_ml.set_index('mutationinformation')
X = my_df_ml
X = X.drop(['mutationinformation'], axis = 1)
X = np.array(X)
#X = X.reset_index()
# check dim
X.shape
Y.shape
my_df2 = my_df2.reset_index() # make the index contiguous so fold indices work with .loc below
#####################
#https://stackoverflow.com/questions/49134338/kfolds-cross-validation-vs-train-test-split
rf = RandomForestClassifier(n_estimators=100, random_state=42)
#https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2
# k-FOLD
print('Class Ratio:', sum(Y) / len(Y))
print('Class Ratio:', sum(my_df2['resistance']) / len(my_df2['resistance']))
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
target = my_df2.loc[:,'resistance']
fold_no = 1
for train_index, test_index in skf.split(my_df2, target):
    train = my_df2.loc[train_index, :]
    test = my_df2.loc[test_index, :]
    print('Fold', str(fold_no),
          'Class Ratio:',
          sum(test['resistance']) / len(test['resistance']))
    fold_no += 1
model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
                                 , ('logis', LogisticRegression(class_weight = 'balanced')) # 'unbalanced' is not a valid class_weight
                                 ])
X_features = my_df_ml.drop(['mutationinformation'], axis = 1).columns.to_list() # names of the ML feature columns
def train_model(train, test, fold_no):
    X_train = train[X_features]
    y_train = train['resistance'] # a Series rather than a one-column DataFrame, so fit() does not warn
    X_test = test[X_features]
    y_test = test['resistance']
    model_logisP.fit(X_train, y_train)
    predictions = model_logisP.predict(X_test)
    print('Fold', str(fold_no),
          'Accuracy:',
          accuracy_score(y_test, predictions))
fold_no = 1
for train_index, test_index in skf.split(my_df2, target):
    train = my_df2.loc[train_index, :]
    test = my_df2.loc[test_index, :]
    train_model(train, test, fold_no)
    fold_no += 1
#%%
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=20)
lst_accu_stratified = []
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = X_scaled[:, [1, 2, 3, 15, 16]] # try an arbitrary subset of the features
#lr = linear_model.LogisticRegression(class_weight = 'balanced')
lr = linear_model.LogisticRegression()
for train_index1, test_index1 in skf.split(X, Y):
    #print(train_index1)
    #print(test_index1)
    x_train_fold1, x_test_fold1 = X_scaled[train_index1], X_scaled[test_index1]
    y_train_fold1, y_test_fold1 = Y[train_index1], Y[test_index1]
    lr.fit(x_train_fold1, y_train_fold1)
    lst_accu_stratified.append(lr.score(x_test_fold1, y_test_fold1))
# print output
print('Accuracy per fold:', lst_accu_stratified)
print('Max accuracy:', max(lst_accu_stratified)*100, "%")
print('Min accuracy:', min(lst_accu_stratified)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratified)*100,"%")
print('St Dev:', stdev(lst_accu_stratified)*100,"%")
# sanity check: repeat the same stratified CV on the sklearn breast-cancer data
cancer = datasets.load_breast_cancer()
x = cancer.data
y = cancer.target
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
lst_accu_stratifiedC = []
scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(x)
x_scaled = x_scaled[:,[1,2,3, 15, 16]]
for train_index, test_index in skf.split(x, y):
    #print(train_index)
    #print(test_index)
    x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]
    lr.fit(x_train_fold, y_train_fold)
    lst_accu_stratifiedC.append(lr.score(x_test_fold, y_test_fold))
# print output
print('Accuracy per fold:', lst_accu_stratifiedC)
print('Max accuracy:', max(lst_accu_stratifiedC)*100, "%")
print('Min accuracy:', min(lst_accu_stratifiedC)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratifiedC)*100,"%")
print('St Dev:', stdev(lst_accu_stratifiedC)*100,"%")
#%%
##
# https://towardsdatascience.com/my-random-forest-classifier-cheat-sheet-in-python-fedb84f8cf4f
# NOTE: my_df_filt still contains NaNs; rows with NaN in or_mychisq fail the
# x <= 1 test and therefore get labelled 1 (resistant) here
y_all = my_df_filt['or_mychisq'].apply(lambda x: 0 if x <= 1 else 1)
seed = 20 # so that the result is reproducible
X_all = my_df_filt.drop(['mutationinformation', 'or_mychisq'], axis = 1)
X_all = X_all.iloc[:, :6] # keep only the first 6 feature columns
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all
                                                    , test_size=0.333
                                                    , random_state = seed)
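# Note (an addition, not in the original): with an imbalanced target it is
# usually safer to stratify the split so both halves keep the class ratio:
# X_train, X_test, y_train, y_test = train_test_split(
#     X_all, y_all, test_size=0.333, random_state=seed, stratify=y_all)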
# One option is to make NA its own category: pandas treats NaN as missing,
# and OneHotEncoder would otherwise ignore those values. Replacing NaN with
# the string 'na' turns missingness into a category of its own, which is then
# taken into account when encoding later on.
#X_train = X_train.fillna('na')
#X_test = X_test.fillna('na')
# Here we impute column medians instead; the test set is filled with the
# *training* medians to avoid leaking test-set information.
X_train = X_train.fillna(X_train.median(numeric_only=True))
X_test = X_test.fillna(X_train.median(numeric_only=True))
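# A minimal sketch of the NA-as-category idea above (an optional alternative,
# off by default): fill NaNs only in the object (categorical) columns, so
# OneHotEncoder learns 'na' as an explicit level while the numeric columns
# keep the median imputation.
use_na_category = False
if use_na_category:
    cat_cols = X_train.select_dtypes(include=['object']).columns
    X_train[cat_cols] = X_train[cat_cols].fillna('na')
    X_test[cat_cols] = X_test[cat_cols].fillna('na')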
X_train.dtypes
features_to_encode = list(X_train.select_dtypes(include = ['object']).columns)
col_trans = make_column_transformer(
    (OneHotEncoder(), features_to_encode),
    remainder = "passthrough"
)
rf_classifier = RandomForestClassifier(
    min_samples_leaf=50,
    n_estimators=150,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=seed,
    max_features='sqrt') # 'auto' was removed for classifiers in recent scikit-learn; 'sqrt' is the equivalent
pipe = make_pipeline(col_trans, rf_classifier)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
accuracy_score(y_test, y_pred)
print(f"The accuracy of the model is {round(accuracy_score(y_test,y_pred),3)*100} %")
recall_score(y_test, y_pred)
precision_score(y_test, y_pred)
f1_score(y_test, y_pred)
roc_auc_score(y_test, y_pred)
roc_curve(y_test, y_pred)
train_probs = pipe.predict_proba(X_train)[:,1]
probs = pipe.predict_proba(X_test)[:, 1]
train_predictions = pipe.predict(X_train)
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs)}')
print(f'Test ROC AUC Score: {roc_auc_score(y_test, probs)}')
def evaluate_model(y_pred, probs, train_predictions, train_probs):
    # NOTE: y_test and y_train are taken from the enclosing scope
    baseline = {}
    baseline['recall'] = recall_score(y_test, [1 for _ in range(len(y_test))])
    baseline['precision'] = precision_score(y_test, [1 for _ in range(len(y_test))])
    baseline['roc'] = 0.5
    results = {}
    results['recall'] = recall_score(y_test, y_pred)
    results['precision'] = precision_score(y_test, y_pred)
    results['roc'] = roc_auc_score(y_test, probs)
    train_results = {}
    train_results['recall'] = recall_score(y_train, train_predictions)
    train_results['precision'] = precision_score(y_train, train_predictions)
    train_results['roc'] = roc_auc_score(y_train, train_probs)
    # for metric in ['recall', 'precision', 'roc']:
    #     print(f"Baseline: {round(baseline[metric], 2)} Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}")
    # Calculate false positive rates and true positive rates
    base_fpr, base_tpr, _ = roc_curve(y_test, [1 for _ in range(len(y_test))])
    model_fpr, model_tpr, _ = roc_curve(y_test, probs)
    plt.figure(figsize = (8, 6))
    plt.rcParams['font.size'] = 16
    # Plot both curves
    plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
    plt.plot(model_fpr, model_tpr, 'r', label = 'model')
    plt.legend(); plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate'); plt.title('ROC Curves')
    plt.show()
# Recall Baseline: 1.0 Test: 0.92 Train: 0.93
# Precision Baseline: 0.48 Test: 0.9 Train: 0.91
# Roc Baseline: 0.5 Test: 0.97 Train: 0.97
evaluate_model(y_pred,probs,train_predictions,train_probs)
#%%
import itertools
def plot_confusion_matrix(cm, classes, normalize = False,
                          title='Confusion matrix',
                          cmap=plt.cm.Greens): # can change color
    plt.figure(figsize = (10, 10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, size = 24)
    plt.colorbar(aspect=4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, size = 14)
    plt.yticks(tick_marks, classes, size = 14)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    # Label each cell with its count
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 fontsize = 20,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.grid(None)
    plt.tight_layout()
    plt.ylabel('True label', size = 18)
    plt.xlabel('Predicted label', size = 18)
# Let's plot it out
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes = ['0 - Susceptible', '1 - Resistant'],
                      title = 'R/S Confusion Matrix')
print(rf_classifier.feature_importances_)
print(f" There are {len(rf_classifier.feature_importances_)} features in total")