added standard KFold as well

parent 04e0267dd1, commit 1fecbc15c9
2 changed files with 128 additions and 61 deletions

my_datap4.py (188 lines)
@@ -84,7 +84,13 @@ Y = my_df['resistance']
#%%
# GET X
cols = my_df.columns
X = my_df[['ligand_distance'
           , 'ligand_affinity_change'
           , 'duet_stability_change', 'ddg_foldx', 'deepddg', 'ddg_dynamut2', 'consurf_score'
           , 'snap2_score'
           #, 'snap2_accuracy_pc'
           , 'asa'
           , 'rsa']]

#%%
####################################
@@ -215,79 +221,141 @@ print(f" There are {len(rf_classifier.feature_importances_)} features in total")
#%%
####################################
# Model 2: stratified K-fold
# Random forest
# Logistic regression
# MinMaxScaler
# https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2 [didn't work as written]
# https://www.geeksforgeeks.org/stratified-k-fold-cross-validation/
####################################
print('Class Ratio:', sum(Y)/len(Y))
print('Class Ratio:', sum(my_df['resistance'])/len(my_df['resistance']))

seed_skf = 50
skf = StratifiedKFold(n_splits = 10
                      , shuffle = True
                      , random_state = seed_skf)

target = my_df.loc[:,'resistance']
lst_accu_stratified = []
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X)
#X_scaled = X_scaled[:,[1,2,3]]

df = my_df[['ligand_distance'
            , 'ligand_affinity_change'
            , 'duet_stability_change'
            , 'ddg_foldx'
            , 'deepddg'
            , 'ddg_dynamut2'
            , 'consurf_score'
            , 'resistance']]

# To start with we'll just split our data and print the class ratio for
# each fold to check that they are all close to the full data set.
# The test set of each split is a single fold, so we use the test split to
# compute each fold's class ratio; each fold's ratio should be close to
# that of the full data set, which is exactly what stratification is for.
#lr = linear_model.LogisticRegression(class_weight = 'balanced')
lr = linear_model.LogisticRegression()

fold_no = 1 # to label the folds for printing output
for train_index, test_index in skf.split(df, target):
    train = df.loc[train_index,:]
    test = df.loc[test_index,:]
    print('Fold', str(fold_no)
          , 'Class Ratio:'
          , sum(test['resistance'])/len(test['resistance']))
    fold_no += 1

for train_index, test_index in skf.split(X, Y):
    #print(train_index)
    #print(test_index)
    x_train_fold, x_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = Y[train_index], Y[test_index]
    lr.fit(x_train_fold, y_train_fold)
    lst_accu_stratified.append(lr.score(x_test_fold, y_test_fold))

# print output
print('Accuracy per fold:', lst_accu_stratified)
print('Max accuracy:', max(lst_accu_stratified)*100, "%")
print('Min accuracy:', min(lst_accu_stratified)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratified)*100, "%")
print('St Dev:', stdev(lst_accu_stratified)*100, "%")
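
# Equivalent shortcut (a sketch, assuming the lr, X_scaled, Y and skf objects
# above): cross_val_score runs the same stratified folds in a single call.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(lr, X_scaled, Y, cv = skf, scoring = 'accuracy')
print('Mean accuracy (cross_val_score):', cv_scores.mean()*100, "%")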

#%%
#--------------------------------------
# Model 2.1: same model, but via a pipeline.
# Slightly different results when using transformed or
# untransformed values, likely because the in-pipeline
# scaler is fit on each training fold only rather than
# once on the full data (no test-fold leakage).
#--------------------------------------
model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
                                 , ('logis', LogisticRegression(class_weight = 'balanced'))
                                 #, ('logis', LogisticRegression(class_weight = 'unbalanced')) # changes stdev; note 'unbalanced' is not a valid sklearn class_weight
                                 ])
model = LogisticRegression()

# Next we'll build a custom function that we can pass our data splits to
# for training and testing.
def train_model(train, test, fold_no):
    # predictor columns for this fold (index by column name, not by a DataFrame)
    pred_cols = ['ligand_distance'
                 , 'ligand_affinity_change'
                 , 'duet_stability_change'
                 , 'ddg_foldx'
                 , 'deepddg'
                 , 'ddg_dynamut2'
                 , 'consurf_score']
    X_train = train[pred_cols]
    y_train = train['resistance']
    X_test = test[pred_cols]
    y_test = test['resistance']
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print('Fold', str(fold_no),
          'Accuracy:',
          accuracy_score(y_test, predictions))
seed_skf = 50
skf = StratifiedKFold(n_splits = 10
                      , shuffle = True
                      , random_state = seed_skf)

# Finally, let's modify the for loop we created above to call the
# train_model function on each of our splits.
X_array = np.array(X)
lst_accu_stratified = []
for train_index, test_index in skf.split(X_array, Y):
    x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
    y_train_fold, y_test_fold = Y[train_index], Y[test_index]
    model_logisP.fit(x_train_fold, y_train_fold)
    lst_accu_stratified.append(model_logisP.score(x_test_fold, y_test_fold))

fold_no = 1
for train_index, test_index in skf.split(df, target):
    train = df.loc[train_index,:]
    test = df.loc[test_index,:]
    train_model(train, test, fold_no)
    fold_no += 1

# print output
print('Accuracy per fold:', lst_accu_stratified)
print('Max accuracy:', max(lst_accu_stratified)*100, "%")
print('Min accuracy:', min(lst_accu_stratified)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratified)*100, "%")
print('St Dev:', stdev(lst_accu_stratified)*100, "%")

####################################
# Model 3: stratified K-fold
# Random forest
# MinMaxScaler
# X needs to be a NumPy array so the stratified K-fold
# indices can be used positionally
####################################

model_rf = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
                             , ('rf', RandomForestClassifier(n_estimators=100, random_state=42))])
seed_skf = 50
skf = StratifiedKFold(n_splits = 10
                      , shuffle = True
                      , random_state = seed_skf)

X_array = np.array(X)
lst_accu_stratified_rf = []
for train_index, test_index in skf.split(X_array, Y):
    x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
    y_train_fold, y_test_fold = Y[train_index], Y[test_index]
    model_rf.fit(x_train_fold, y_train_fold)
    lst_accu_stratified_rf.append(model_rf.score(x_test_fold, y_test_fold))

# print output
print('Accuracy per fold:', lst_accu_stratified_rf)
print('Max accuracy:', max(lst_accu_stratified_rf)*100, "%")
print('Min accuracy:', min(lst_accu_stratified_rf)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratified_rf)*100, "%")
print('St Dev:', stdev(lst_accu_stratified_rf)*100, "%")
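
# Sketch: pull per-feature importances out of the fitted pipeline (here from
# whichever fold was fitted last); 'rf' is the step name given in model_rf.
rf_step = model_rf.named_steps['rf']
for name, imp in zip(X.columns, rf_step.feature_importances_):
    print(f"{name}: {imp:.3f}")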

####################################
# Model 4: cross-validated K-fold
# Random forest
# MinMaxScaler
# X needs to be an array for KFold
# FIXME: mean_squared_error is a regression metric; on 0/1 class labels
# it reduces to the misclassification rate, so 'accuracy' is clearer here
####################################
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate

score_fn = make_scorer(mean_squared_error)
scores = cross_validate(model_rf, X_train, y_train
                        , scoring = score_fn
                        , cv = 10)
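
# For 0/1 class labels, the MSE of hard predictions is simply the
# misclassification rate (1 - accuracy), so the more direct scoring
# choice is 'accuracy'; a sketch on the same data and folds:
scores_acc = cross_validate(model_rf, X_train, y_train
                            , scoring = 'accuracy'
                            , cv = 10)
print('Mean CV accuracy:', scores_acc['test_score'].mean())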

from itertools import combinations

def train(X):
    return cross_validate(model_rf, X, y_train
                          , scoring = score_fn
                          , cv = 10)['test_score']

# note: combinations(cols, k) is empty when k exceeds the number of columns,
# so this search only produces results if X_train has at least 11 columns
scores = [train(X_train.loc[:, list(vars)]) for vars in combinations(X_train.columns, 11)]
means = [score.mean() for score in scores]
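
# Sketch of a follow-up: keep the combinations themselves so the best-scoring
# subset can be identified (combos/combo_means are illustrative names; with
# the MSE scorer, lower is better).
combos = list(combinations(X_train.columns, 11))
combo_means = [train(X_train.loc[:, list(c)]).mean() for c in combos]
if combo_means:
    best = combos[combo_means.index(min(combo_means))]
    print('Best subset by mean CV MSE:', best)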

#%%
# https://stackoverflow.com/questions/52316237/finding-logistic-regression-weights-from-k-fold-cv
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True, random_state=42)
logistic = LogisticRegressionCV(Cs=2, fit_intercept=True, cv=kf, verbose=1, random_state=42)
logistic.fit(X_train, y_train)
print("Train Coefficient:", logistic.coef_)    # weight of each feature
print("Train Intercept:", logistic.intercept_) # value of the intercept

#%%
# https://machinelearningmastery.com/repeated-k-fold-cross-validation-with-python/
from sklearn.model_selection import cross_val_score
from numpy import std

cv = KFold(n_splits=10, random_state=1, shuffle=True)
scores = cross_val_score(model_rf, X, Y, scoring='accuracy', cv=cv, n_jobs=-1)
scores2 = cross_val_score(model_logisP, X, Y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
print('Accuracy: %.3f (%.3f)' % (mean(scores2), std(scores2)))
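
#%%
# The article linked above actually covers *repeated* k-fold; a minimal
# sketch of that variant with the same model and scoring:
from sklearn.model_selection import RepeatedKFold
cv_rep = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores_rep = cross_val_score(model_rf, X, Y, scoring='accuracy', cv=cv_rep, n_jobs=-1)
print('Repeated K-fold accuracy: %.3f (%.3f)' % (mean(scores_rep), std(scores_rep)))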