added standard KFold as well

This commit is contained in:
Tanushree Tunstall 2022-03-03 15:18:34 +00:00
parent 04e0267dd1
commit 1fecbc15c9
2 changed files with 128 additions and 61 deletions


@ -1 +0,0 @@
Tanushree Tunstall,tanu,panino.tunstall.in,03.03.2022 11:47,file:///home/tanu/.config/libreoffice/4;


@ -84,7 +84,13 @@ Y = my_df['resistance']
#%%
# GET X
cols = my_df.columns
X = my_df[['ligand_distance'
           , 'ligand_affinity_change'
           , 'duet_stability_change'
           , 'ddg_foldx'
           , 'deepddg'
           , 'ddg_dynamut2'
           , 'consurf_score'
           , 'snap2_score'
           #, 'snap2_accuracy_pc'
           , 'asa'
           , 'rsa']]
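# Quick sanity check before modelling: sklearn scalers and estimators
# fail on missing values (a minimal sketch, not part of the pipeline proper):
print('X shape:', X.shape)
print(X.isna().sum())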
#%%
####################################
@ -215,79 +221,141 @@ print(f" There are {len(rf_classifier.feature_importances_)} features in total")
#%%
####################################
# Model 2: case of stratified K-fold
# Logistic regression
# MinMaxScaler
# https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2 [ Didn't work!]
# https://www.geeksforgeeks.org/stratified-k-fold-cross-validation/
####################################
print('Class Ratio:',
      sum(Y)/len(Y))
print('Class Ratio:',
      sum(my_df['resistance'])/len(my_df['resistance']))
seed_skf = 50
skf = StratifiedKFold(n_splits = 10
, shuffle = True
, random_state = seed_skf)
target = my_df.loc[:,'resistance']
lst_accu_stratified = []
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X)
#X_scaled = X_scaled[:,[1,2,3]]
df = my_df[['ligand_distance'
, 'ligand_affinity_change'
, 'duet_stability_change'
, 'ddg_foldx'
, 'deepddg'
, 'ddg_dynamut2'
, 'consurf_score'
, 'resistance']]
# To start with, we'll just split our data and print the class ratio for
# each fold to check that they are all close to the full data set.
# The test set contains a single fold, so we use the test split to determine
# the class ratio for each fold. Each fold's class ratio should be close
# to the full data set's, which is what we want.
#lr = linear_model.LogisticRegression(class_weight = 'balanced') # note: 'unbalanced' is not a valid class_weight option
lr = linear_model.LogisticRegression()
fold_no = 1 # to label the folds for printing output
for train_index, test_index in skf.split(df, target):
    train = df.loc[train_index,:]
    test = df.loc[test_index,:]
    print('Fold', str(fold_no)
          , 'Class Ratio:'
          , sum(test['resistance'])/len(test['resistance']))
    fold_no += 1
for train_index, test_index in skf.split(X, Y):
    #print(train_index)
    #print(test_index)
    x_train_fold, x_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = Y[train_index], Y[test_index]
    lr.fit(x_train_fold, y_train_fold)
    lst_accu_stratified.append(lr.score(x_test_fold, y_test_fold))
# print output
print('Accuracy for each fold:', lst_accu_stratified)
print('Max accuracy:', max(lst_accu_stratified)*100, "%")
print('Min accuracy:', min(lst_accu_stratified)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratified)*100,"%")
print('St Dev:', stdev(lst_accu_stratified)*100,"%")
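# Sanity check: cross_val_score with the same StratifiedKFold object should
# reproduce the fold scores from the manual loop above (a minimal sketch;
# it refits a clone of lr on each training fold).
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(lr, X_scaled, Y, cv = skf)
print('cross_val_score mean:', cv_scores.mean()*100, "%")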
#%%
#--------------------------------------
# Model 2.1: same model, but with a pipeline.
# Results differ slightly from Model 2 because the pipeline fits the
# MinMaxScaler on each training fold only, instead of scaling the full
# data set up front (which leaks test-fold information into the scaler).
#--------------------------------------
model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
                                 #, ('logis', LogisticRegression(class_weight = 'balanced')) # changes stdev
                                 , ('logis', LogisticRegression())]) # NB: 'unbalanced' is not a valid class_weight; the default (None) leaves classes unweighted
seed_skf = 50
skf = StratifiedKFold(n_splits = 10
, shuffle = True
, random_state = seed_skf)
# Run the same stratified splits through the pipeline and collect the
# accuracy for each fold.
X_array = np.array(X)
lst_accu_stratified = []
for train_index, test_index in skf.split(X_array, Y):
    x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
    y_train_fold, y_test_fold = Y[train_index], Y[test_index]
    model_logisP.fit(x_train_fold, y_train_fold)
    lst_accu_stratified.append(model_logisP.score(x_test_fold, y_test_fold))
# print output
print('Accuracy for each fold:', lst_accu_stratified)
print('Max accuracy:', max(lst_accu_stratified)*100, "%")
print('Min accuracy:', min(lst_accu_stratified)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratified)*100,"%")
print('St Dev:', stdev(lst_accu_stratified)*100,"%")
####################################
# Model 3: stratified K-fold
# Random forest
# MinMaxScaler
# X needs to be a numpy array: StratifiedKFold yields positional indices
####################################
model_rf = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
, ('rf' , RandomForestClassifier(n_estimators=100, random_state=42))])
seed_skf = 50
skf = StratifiedKFold(n_splits = 10
, shuffle = True
, random_state = seed_skf)
X_array = np.array(X)
lst_accu_stratified_rf = []
for train_index, test_index in skf.split(X_array, Y):
    x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
    y_train_fold, y_test_fold = Y[train_index], Y[test_index]
    model_rf.fit(x_train_fold, y_train_fold)
    lst_accu_stratified_rf.append(model_rf.score(x_test_fold, y_test_fold))
# print output
print('Accuracy for each fold:', lst_accu_stratified_rf)
print('Max accuracy:', max(lst_accu_stratified_rf)*100, "%")
print('Min accuracy:', min(lst_accu_stratified_rf)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratified_rf)*100,"%")
print('St Dev:', stdev(lst_accu_stratified_rf)*100,"%")
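# Feature importances can be read off the fitted pipeline step (a minimal
# sketch; 'rf' is the step name defined above, and this reflects whichever
# fold was fitted last):
for name, imp in zip(X.columns, model_rf.named_steps['rf'].feature_importances_):
    print(name, ':', round(imp, 3))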
####################################
# Model 4: Cross validate K-fold
# Random forest
# MinMaxScaler
# X needs to be a numpy array: KFold yields positional indices
# FIXME: doesn't work as intended: MSE is a regression metric and is not an
# appropriate score for classifiers such as logistic regression or random forest
####################################
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate
score_fn = make_scorer(mean_squared_error)
scores = cross_validate(model_rf, X_train, y_train
, scoring = score_fn
, cv = 10)
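# A classification-appropriate alternative (a sketch: accuracy and F1 are
# standard classifier metrics, unlike MSE, and this assumes 'resistance'
# is a binary 0/1 label):
scores_clf = cross_validate(model_rf, X_train, y_train
                            , scoring = ['accuracy', 'f1']
                            , cv = 10)
print('Mean accuracy:', scores_clf['test_accuracy'].mean())
print('Mean F1:', scores_clf['test_f1'].mean())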
from itertools import combinations
def train(X):
    return cross_validate(model_rf, X, y_train
                          , scoring = score_fn
                          , cv = 10
                          , return_estimator = True)['test_score']
scores = [train(X_train.loc[:, list(vars)]) for vars in combinations(X_train.columns, 11)] # list() needed: .loc treats a bare tuple as a single label
means = [score.mean() for score in scores]
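# To see which feature subset did best, pair each mean with its combination
# (a sketch; with the raw-MSE scorer above, lower is better):
best = min(zip(means, combinations(X_train.columns, 11)), default = None)
print('Best subset:', best)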
#%%
# https://stackoverflow.com/questions/52316237/finding-logistic-regression-weights-from-k-fold-cv
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import KFold
kf = KFold(n_splits=10, shuffle=True, random_state=42)
logistic = LogisticRegressionCV(Cs=2, fit_intercept=True, cv=kf, verbose =1, random_state=42)
logistic.fit(X_train, y_train)
print("Train Coefficient:" , logistic.coef_) #weights of each feature
print("Train Intercept:" , logistic.intercept_) #value of intercept
#%%
# https://machinelearningmastery.com/repeated-k-fold-cross-validation-with-python/
from sklearn.model_selection import cross_val_score
from numpy import std
cv = KFold(n_splits=10, random_state=1, shuffle=True)
scores = cross_val_score(model_rf, X,Y, scoring='accuracy', cv=cv, n_jobs=-1)
scores2 = cross_val_score(model_logisP, X, Y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
print('Accuracy: %.3f (%.3f)' % (mean(scores2), std(scores2)))
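# The linked article covers repeated k-fold as well; a sketch with
# RepeatedKFold (10 splits x 3 repeats) for a less noisy estimate:
from sklearn.model_selection import RepeatedKFold
rcv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores_r = cross_val_score(model_rf, X, Y, scoring='accuracy', cv=rcv, n_jobs=-1)
print('Accuracy: %.3f (%.3f)' % (mean(scores_r), std(scores_r)))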