added practice scripts 2 and 3 to test different methods
parent 9d46613ca4
commit 25a55ac914
2 changed files with 555 additions and 0 deletions
179 my_datap2.py Normal file
@@ -0,0 +1,179 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 24 10:48:10 2022

@author: tanu
"""
###############################################################################
# questions:
# which data to use: merged_df3 or merged_df2?
# which is the target: or_mychisq or the drtype col?
# scaling: can it be from -1 to 1?
# how to include the mutation information?
#   i.e. 'wild_type', 'mutant', 'position'
# whether to log-transform the af and or cols
# how to allow mean/mode values to be imputed for the validation set
#   (see the SimpleImputer sketch further down)
# whether to calculate mean/median accounting for NAs or after removing them?
#   (see the quick skipna check after the imports)

# strategy:
# available data                = X_train
# available data but NaN target = validation_test
# test data: mutations generated that are not in mcsm
###############################################################################
import os, sys
import re
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
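
# Quick check for the "mean/median accounting for NAs" question above:
# pandas skips NaN by default (skipna=True). Toy series with hypothetical
# values, not project data:
_s = pd.Series([1.0, 2.0, np.nan])
print(_s.mean())              # 1.5 -- NaN ignored by default
print(_s.mean(skipna=False))  # nan -- NaN propagates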

#%% read data
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")

# this needs to be merged_df2 or merged_df3?
my_df = pd.read_csv("pnca_all_params.csv")

my_df.dtypes
my_df_cols = my_df.columns

omit_cols1 = ['pdb_file'
              , 'seq_offset4pdb'
              , 'mut_3upper'
              , 'wild_pos'
              , 'wild_chain_pos'
              , 'chain'
              , 'wt_3upper'
              , 'consurf_colour'
              , 'consurf_colour_rev'
              , 'consurf_msa_data'
              , 'consurf_aa_variety'
              , 'snap2_accuracy_pc'
              , 'beta_logistic'
              , 'se_logistic'
              , 'zval_logisitc'
              , 'pval_chisq'
              , 'log10_or_mychisq'
              , 'neglog_pval_fisher'
              , 'or_fisher'
              , 'wild_type'
              , 'mutant_type'
              , 'position'
              , 'ligand_id'
              , 'mutation'
              , 'ss'
              , 'ss_class' # include it later?
              , 'contacts'
              ]

# str.contains() does a regex search, so plain substrings are enough here
omit_cols2 = list(my_df.columns[my_df.columns.str.contains("ci_")
                                | my_df.columns.str.contains("_scaled")
                                | my_df.columns.str.contains("_outcome")])

# [WATCH:] just to test, since these have negative values!
omit_cols3 = list(my_df.columns[my_df.columns.str.contains("electro_")
                                | my_df.columns.str.contains("disulfide_")
                                | my_df.columns.str.contains("hbonds_")
                                | my_df.columns.str.contains("partcov_")
                                | my_df.columns.str.contains("vdwclashes")
                                | my_df.columns.str.contains("volumetric")])

omit_cols = omit_cols1 + omit_cols2 + omit_cols3

my_df_filt = my_df.loc[:, ~my_df.columns.isin(omit_cols)]
my_df_filt_cols = my_df_filt.columns

# fill NaNs with the column mean / median (numeric columns only)
my_df_filt2 = my_df_filt.fillna(my_df_filt.mean(numeric_only=True))
my_df_filt3 = my_df_filt.fillna(my_df_filt.median(numeric_only=True))

my_df_filt_noNA = my_df_filt.fillna(0)

summ = my_df_filt.describe()
summ_noNA = my_df_filt_noNA.describe()

foo = my_df_filt['or_mychisq'].value_counts()
foo = foo.to_frame()

########################
# [WATCH]: drop NAs, then derive the binary target:
# or_mychisq <= 1 -> 0 (susceptible), > 1 -> 1 (resistant)
my_df2 = my_df_filt3.dropna().copy()  # .copy() avoids SettingWithCopyWarning
my_df2['resistance'] = my_df2['or_mychisq'].apply(lambda x: 0 if x <= 1 else 1)
my_df2['resistance'].value_counts()
y = my_df2['resistance']
y.value_counts()
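
# The "impute mean/mode for the validation set" question above maps onto
# sklearn's SimpleImputer: fit on training data only, then reuse those
# statistics on validation to avoid leakage. Minimal self-contained sketch
# with toy data (not project data):
from sklearn.impute import SimpleImputer
_toy_train = pd.DataFrame({'a': [1.0, np.nan, 3.0]})
_toy_valid = pd.DataFrame({'a': [np.nan]})
_imp = SimpleImputer(strategy='median')
_imp.fit(_toy_train)              # learns the training median of 'a' (2.0)
print(_imp.transform(_toy_valid)) # validation NaN -> 2.0, the train median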

#%%============================================================================
# validation set: mutations whose target is missing, i.e. rows not in my_df2
X_validation_muts = my_df['mutationinformation'][~my_df['mutationinformation'].isin(my_df2['mutationinformation'])]
X_validation_all = my_df_filt3[~my_df_filt3['mutationinformation'].isin(my_df2['mutationinformation'])]
X_validation_f = X_validation_all.loc[:, ~X_validation_all.columns.isin(['or_mychisq', 'resistance'])]
X_validation = X_validation_f.set_index('mutationinformation')

#%% NAs in the validation cols (candidates to fill with mean values)
X_validation.info()
X_validation.isna().any()

na_df = X_validation_f[X_validation_f.columns[X_validation_f.isna().any()]]
na_colnames = X_validation_f.columns[X_validation_f.isna().any()]
na_colsL = list(na_colnames)

#==============================================================================
# training set: rows with an observed target
omit_cols_y = ['or_mychisq', 'resistance']
my_df_ml = my_df2.loc[:, ~my_df2.columns.isin(omit_cols_y)]
#%%############################################################################
X_train = my_df_ml.set_index('mutationinformation')
#X_train = X_train.iloc[:,:4]
y_train = y
#X_train = X_train.dropna()
#y_train = y.dropna()

# check dim
X_train.shape
y_train.shape
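
# Per the strategy notes at the top, impute any remaining validation-set NAs
# with *training* column statistics (a sketch; mean is assumed here, median
# or mode would work the same way):
X_validation = X_validation.fillna(X_train.mean(numeric_only=True))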

###############################################################################
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score

# 'balanced' is the valid class_weight option ('unbalanced' is not accepted)
model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
                                 , ('logis', LogisticRegression(class_weight = 'balanced'))
                                 ])

model_logisP.fit(X_train, y_train)
fitted_vals = model_logisP.predict(X_train)
fitted_vals

# gives the array of predictions
model_logisP.predict(X_train)
model_logisP.predict(X_validation)
y_pred = model_logisP.predict(X_train)
y_pred2 = model_logisP.predict(X_validation)

# training-set metrics: score y_pred (not y_pred2, whose validation rows
# have no labels to score against)
accuracy_score(y_train, y_pred)
precision_score(y_train, y_pred, pos_label = 1) # tp/(tp + fp)
recall_score(y_train, y_pred, pos_label = 1)    # tp/(tp + fn)
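
# Cross-check of the training-set scores above (a sketch): the confusion
# matrix exposes tn/fp/fn/tp directly.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train, y_pred))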

################
acc = make_scorer(accuracy_score)

def precision(y_true, y_pred):
    return precision_score(y_true, y_pred, pos_label = 1) #0

def recall(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label = 1) #0

prec = make_scorer(precision)
rec = make_scorer(recall)

output = cross_validate(model_logisP
                        , X_train
                        , y
                        , scoring = {'acc' : acc
                                     , 'prec' : prec
                                     , 'rec' : rec}
                        , cv = 10, return_train_score = False)

pd.DataFrame(output).mean()
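
# The custom make_scorer wrappers above can also be replaced by sklearn's
# built-in scorer names; for pos_label=1 this sketch should give the same
# numbers:
output2 = cross_validate(model_logisP, X_train, y,
                         scoring = {'acc': 'accuracy',
                                    'prec': 'precision',
                                    'rec': 'recall'},
                         cv = 10, return_train_score = False)
print(pd.DataFrame(output2).mean())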
376 my_datap3.py Normal file
@@ -0,0 +1,376 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 24 10:48:10 2022

@author: tanu
"""
###############################################################################
# questions:
# which data to use: merged_df3 or merged_df2?
# which is the target: or_mychisq or the drtype col?
# scaling: can it be from -1 to 1?
# how to include the mutation information?
#   i.e. 'wild_type', 'mutant', 'position'
# whether to log-transform the af and or cols
# how to allow mean/mode values to be imputed for the validation set
# whether to calculate mean/median accounting for NAs or after removing them?

# strategy:
# available data                = X_train
# available data but NaN target = validation_test
# test data: mutations generated that are not in mcsm
###############################################################################
import os, sys
import re
from sklearn import datasets
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
from statistics import mean, stdev

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score

#%% read data
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")

# this needs to be merged_df2 or merged_df3?
my_df = pd.read_csv("pnca_all_params.csv")

my_df.dtypes
my_df_cols = my_df.columns

omit_cols1 = ['pdb_file'
              , 'seq_offset4pdb'
              , 'mut_3upper'
              , 'wild_pos'
              , 'wild_chain_pos'
              , 'chain'
              , 'wt_3upper'
              , 'consurf_colour'
              , 'consurf_colour_rev'
              , 'consurf_msa_data'
              , 'consurf_aa_variety'
              , 'snap2_accuracy_pc'
              , 'beta_logistic'
              , 'se_logistic'
              , 'zval_logisitc'
              , 'pval_chisq'
              , 'log10_or_mychisq'
              , 'neglog_pval_fisher'
              , 'or_fisher'
              , 'wild_type'
              , 'mutant_type'
              , 'position'
              , 'ligand_id'
              , 'mutation'
              , 'ss'
              , 'ss_class' # include it later?
              , 'contacts'
              ]

# str.contains() does a regex search, so plain substrings are enough here
omit_cols2 = list(my_df.columns[my_df.columns.str.contains("ci_")
                                | my_df.columns.str.contains("_scaled")
                                | my_df.columns.str.contains("_outcome")])

# [WATCH:] just to test, since these have negative values!
omit_cols3 = list(my_df.columns[my_df.columns.str.contains("electro_")
                                | my_df.columns.str.contains("disulfide_")
                                | my_df.columns.str.contains("hbonds_")
                                | my_df.columns.str.contains("partcov_")
                                | my_df.columns.str.contains("vdwclashes")
                                | my_df.columns.str.contains("volumetric")])

omit_cols = omit_cols1 + omit_cols2 + omit_cols3

# Filter df: keep only the selected columns
my_df_filt = my_df.loc[:, ~my_df.columns.isin(omit_cols)]
my_df_filt_cols = my_df_filt.columns

# fill NAs of the filtered df with column means / medians (numeric cols only)
my_df_filt2 = my_df_filt.fillna(my_df_filt.mean(numeric_only=True))
my_df_filt3 = my_df_filt.fillna(my_df_filt.median(numeric_only=True))
#my_df_filt_noNA = my_df_filt.fillna(0)

summ = my_df_filt.describe()
summ2 = my_df_filt2.describe()
summ3 = my_df_filt3.describe()
#summ_noNA = my_df_filt_noNA.describe()

########################
# [WATCH]: drop NAs
# Get Y: binarise the odds ratio (or_mychisq <= 1 -> 0, > 1 -> 1)
my_df2 = my_df_filt.dropna().copy()
my_df2['resistance'] = my_df2['or_mychisq'].apply(lambda x: 0 if x <= 1 else 1)
my_df2['resistance'].value_counts()
Y = my_df2['resistance']
Y = np.array(Y)
#Y = Y.reset_index()
#Y = Y.drop(['index'], axis = 1)
#Y.value_counts()
#Y = np.array(Y)
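
# quick class-balance check on the numpy target
print(np.bincount(Y))  # counts of class 0 and class 1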

# Get X
omit_cols_y = ['or_mychisq', 'resistance']
my_df_ml = my_df2.loc[:, ~my_df2.columns.isin(omit_cols_y)]
#my_df_ml = my_df_ml.set_index('mutationinformation')
X = my_df_ml
X = X.drop(['mutationinformation'], axis = 1)
X = np.array(X)

#X = X.reset_index()

# check dim
X.shape
Y.shape
my_df2 = my_df2.reset_index()

#####################
#https://stackoverflow.com/questions/49134338/kfolds-cross-validation-vs-train-test-split
rf = RandomForestClassifier(n_estimators=100, random_state=42)

#https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2
# k-FOLD
print('Class Ratio:',
      sum(Y)/len(Y))

print('Class Ratio:',
      sum(my_df2['resistance'])/len(my_df2['resistance'])
      )

target = my_df2.loc[:,'resistance']

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_no = 1
for train_index, test_index in skf.split(my_df2, target):
    train = my_df2.loc[train_index,:]
    test = my_df2.loc[test_index,:]
    print('Fold', str(fold_no),
          'Class Ratio:',
          sum(test['resistance'])/len(test['resistance']))
    fold_no += 1

model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
                                 , ('logis', LogisticRegression(class_weight = 'balanced'))
                                 ])

# feature columns: all of my_df_ml except the mutation identifier
X_features = my_df_ml.drop(['mutationinformation'], axis = 1).columns.to_list()

def train_model(train, test, fold_no):
    X = X_features
    y = 'resistance'
    X_train = train[X]
    y_train = train[y]
    X_test = test[X]
    y_test = test[y]
    model_logisP.fit(X_train, y_train)
    predictions = model_logisP.predict(X_test)
    print('Fold', str(fold_no),
          'Accuracy:',
          accuracy_score(y_test, predictions))

fold_no = 1
for train_index, test_index in skf.split(my_df2, target):
    train = my_df2.loc[train_index,:]
    test = my_df2.loc[test_index,:]
    train_model(train, test, fold_no)
    fold_no += 1
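
# One-liner equivalent of the manual loop above (a sketch): cross_val_score
# with the same stratified splitter should reproduce the per-fold accuracies.
from sklearn.model_selection import cross_val_score
fold_accs = cross_val_score(model_logisP, my_df2[X_features], target,
                            cv = skf, scoring = 'accuracy')
print(fold_accs, fold_accs.mean())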

#%%
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=20)
lst_accu_stratified = []
scaler = preprocessing.MinMaxScaler()
# note: scaling the full X before splitting leaks test information into the
# folds; fitting the scaler inside each fold (or in a Pipeline) avoids this
X_scaled = scaler.fit_transform(X)
X_scaled = X_scaled[:,[1,2,3,15,16]]

#lr = linear_model.LogisticRegression(class_weight = 'balanced')
lr = linear_model.LogisticRegression()

for train_index1, test_index1 in skf.split(X, Y):
    #print(train_index)
    #print(test_index)
    x_train_fold1, x_test_fold1 = X_scaled[train_index1], X_scaled[test_index1]
    y_train_fold1, y_test_fold1 = Y[train_index1], Y[test_index1]
    lr.fit(x_train_fold1, y_train_fold1)
    lst_accu_stratified.append(lr.score(x_test_fold1, y_test_fold1))
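
# Leakage-free variant of the loop above (a sketch): with the scaler inside
# a Pipeline it is re-fitted on each training fold only.
from sklearn.model_selection import cross_val_score
lr_pipe = make_pipeline(preprocessing.MinMaxScaler(), linear_model.LogisticRegression())
fold_accs_pipe = cross_val_score(lr_pipe, X[:, [1,2,3,15,16]], Y,
                                 cv = skf, scoring = 'accuracy')
print(fold_accs_pipe.mean())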

# print output
print('List of possible accuracy:', lst_accu_stratified)
print('Max accuracy:', max(lst_accu_stratified)*100, "%")
print('Min accuracy:', min(lst_accu_stratified)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratified)*100, "%")
print('St Dev:', stdev(lst_accu_stratified)*100, "%")

# cancer data: repeat the same stratified-CV check on a known dataset
cancer = datasets.load_breast_cancer()
x = cancer.data
y = cancer.target
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
lst_accu_stratifiedC = []
scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(x)
x_scaled = x_scaled[:,[1,2,3,15,16]]

for train_index, test_index in skf.split(x, y):
    #print(train_index)
    #print(test_index)
    x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]
    lr.fit(x_train_fold, y_train_fold)
    lst_accu_stratifiedC.append(lr.score(x_test_fold, y_test_fold))

# print output
print('List of possible accuracy:', lst_accu_stratifiedC)
print('Max accuracy:', max(lst_accu_stratifiedC)*100, "%")
print('Min accuracy:', min(lst_accu_stratifiedC)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratifiedC)*100, "%")
print('St Dev:', stdev(lst_accu_stratifiedC)*100, "%")

#%%
##
# https://towardsdatascience.com/my-random-forest-classifier-cheat-sheet-in-python-fedb84f8cf4f
# note: NaN odds ratios fall into the 'else' branch here (NaN <= 1 is False),
# i.e. a missing or_mychisq is labelled 1
y_all = my_df_filt['or_mychisq'].apply(lambda x: 0 if x <= 1 else 1)
X_all = my_df_filt.drop(['mutationinformation', 'or_mychisq'], axis = 1)
seed = 20 # so that the result is reproducible

X_all = X_all.iloc[:,:6]

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all
                                                    , test_size=0.333
                                                    , random_state = seed)
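
# Suggestion (not in the original call): with an imbalanced target,
# stratify=y_all keeps the class ratio consistent across the split:
# X_train, X_test, y_train, y_test = train_test_split(
#     X_all, y_all, test_size=0.333, random_state=seed, stratify=y_all)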

# Now it is time to make NA a category.
# In Python, NaN is considered an NA.
# When encoded, those NaNs would be ignored.
# Hence it is useful to replace NaN with the string 'na', which then becomes
# a category of its own and is taken into account when encoding later on.
#X_train = X_train.fillna('na')
#X_test = X_test.fillna('na')

# impute both splits with the *training* medians to avoid test-set leakage
X_train = X_train.fillna(X_train.median(numeric_only=True))
X_test = X_test.fillna(X_train.median(numeric_only=True))

X_train.dtypes

features_to_encode = list(X_train.select_dtypes(include = ['object']).columns)

# handle_unknown='ignore' so categories unseen in training do not break predict
col_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown = 'ignore'), features_to_encode),
    remainder = "passthrough"
    )

rf_classifier = RandomForestClassifier(
    min_samples_leaf=50,
    n_estimators=150,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=seed,
    max_features='sqrt') # 'sqrt' is the valid spelling of the old 'auto'

pipe = make_pipeline(col_trans, rf_classifier)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

accuracy_score(y_test, y_pred)
print(f"The accuracy of the model is {round(accuracy_score(y_test, y_pred)*100, 1)} %")

recall_score(y_test, y_pred)
precision_score(y_test, y_pred)
f1_score(y_test, y_pred)
# label-based AUC; the probability-based AUC is computed just below
roc_auc_score(y_test, y_pred)
roc_curve(y_test, y_pred)

train_probs = pipe.predict_proba(X_train)[:,1]
probs = pipe.predict_proba(X_test)[:, 1]
train_predictions = pipe.predict(X_train)

print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs)}')
print(f'Test ROC AUC Score: {roc_auc_score(y_test, probs)}')

def evaluate_model(y_pred, probs, train_predictions, train_probs):
    # baseline: predict the positive class for every sample
    baseline = {}
    baseline['recall'] = recall_score(y_test,
                                      [1 for _ in range(len(y_test))])
    baseline['precision'] = precision_score(y_test,
                                            [1 for _ in range(len(y_test))])
    baseline['roc'] = 0.5
    results = {}
    results['recall'] = recall_score(y_test, y_pred)
    results['precision'] = precision_score(y_test, y_pred)
    results['roc'] = roc_auc_score(y_test, probs)
    train_results = {}
    train_results['recall'] = recall_score(y_train,
                                           train_predictions)
    train_results['precision'] = precision_score(y_train, train_predictions)
    train_results['roc'] = roc_auc_score(y_train, train_probs)
    for metric in ['recall', 'precision', 'roc']:
        print(f"Baseline: {round(baseline[metric], 2)} Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}")

    # Calculate false positive rates and true positive rates
    base_fpr, base_tpr, _ = roc_curve(y_test, [1 for _ in range(len(y_test))])
    model_fpr, model_tpr, _ = roc_curve(y_test, probs)
    plt.figure(figsize = (8, 6))
    plt.rcParams['font.size'] = 16
    # Plot both curves
    plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
    plt.plot(model_fpr, model_tpr, 'r', label = 'model')
    plt.legend(); plt.xlabel('False Positive Rate');
    plt.ylabel('True Positive Rate'); plt.title('ROC Curves');
    plt.show()

# Recall    Baseline: 1.0  Test: 0.92 Train: 0.93
# Precision Baseline: 0.48 Test: 0.9  Train: 0.91
# Roc       Baseline: 0.5  Test: 0.97 Train: 0.97

evaluate_model(y_pred, probs, train_predictions, train_probs)

#%%
import itertools
def plot_confusion_matrix(cm, classes, normalize = False,
                          title='Confusion matrix',
                          cmap=plt.cm.Greens): # can change color
    plt.figure(figsize = (10, 10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, size = 24)
    plt.colorbar(aspect=4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, size = 14)
    plt.yticks(tick_marks, classes, size = 14)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    # Label the plot
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 fontsize = 20,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.grid(None)
    plt.tight_layout()
    plt.ylabel('True label', size = 18)
    plt.xlabel('Predicted label', size = 18)

# Let's plot it out
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes = ['0 - Susceptible', '1 - Resistant'],
                      title = 'R/S Confusion Matrix')

print(rf_classifier.feature_importances_)
# note: this counts the one-hot-expanded feature space, not the raw columns
print(f"There are {len(rf_classifier.feature_importances_)} features in total")
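
# Sketch pairing importances with the transformed feature names; assumes a
# scikit-learn version where fitted ColumnTransformers expose
# get_feature_names_out():
feat_names = col_trans.get_feature_names_out()
importances = pd.Series(rf_classifier.feature_importances_, index = feat_names)
print(importances.sort_values(ascending = False).head(10))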