From 1fecbc15c983d0ba100afeca5d422c0dc1e943f2 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Thu, 3 Mar 2022 15:18:34 +0000
Subject: [PATCH] added standard KFold as well

---
 ml_data/.~lock.data_colnames.ods# |   1 -
 my_datap4.py                      | 188 ++++++++++++++++++++----------
 2 files changed, 128 insertions(+), 61 deletions(-)
 delete mode 100644 ml_data/.~lock.data_colnames.ods#

diff --git a/ml_data/.~lock.data_colnames.ods# b/ml_data/.~lock.data_colnames.ods#
deleted file mode 100644
index 17d42db..0000000
--- a/ml_data/.~lock.data_colnames.ods#
+++ /dev/null
@@ -1 +0,0 @@
-Tanushree Tunstall,tanu,panino.tunstall.in,03.03.2022 11:47,file:///home/tanu/.config/libreoffice/4;
\ No newline at end of file

diff --git a/my_datap4.py b/my_datap4.py
index eac2d39..f271d21 100644
--- a/my_datap4.py
+++ b/my_datap4.py
@@ -84,7 +84,13 @@ Y = my_df['resistance']
 #%%
 # GET X
 cols = my_df.columns
-X = my_df[['ligand_distance', 'ligand_affinity_change', 'duet_stability_change', 'ddg_foldx', 'deepddg', 'ddg_dynamut2', 'consurf_score']]
+X = my_df[['ligand_distance'
+           , 'ligand_affinity_change'
+           , 'duet_stability_change', 'ddg_foldx', 'deepddg', 'ddg_dynamut2', 'consurf_score'
+           , 'snap2_score'
+           #, 'snap2_accuracy_pc'
+           , 'asa'
+           , 'rsa']]
 
 #%%
 ####################################
@@ -215,79 +221,141 @@ print(f" There are {len(rf_classifier.feature_importances_)} features in total")
 #%%
 ####################################
 # Model 2: case of stratified K-fold
-# Random forest
+# Logistic regression
 # MinMaxScaler
-# https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2
+# https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2 [didn't work!]
+# https://www.geeksforgeeks.org/stratified-k-fold-cross-validation/
 ####################################
 print('Class Ratio:', sum(Y)/len(Y))
 print('Class Ratio:',
-      sum(my_df['resistance'])/len(my_df['resistance'])
-      )
+      sum(my_df['resistance'])/len(my_df['resistance']))
 seed_skf = 50
-skf = StratifiedKFold(n_splits = 10 
+skf = StratifiedKFold(n_splits = 10
                       , shuffle = True
                       , random_state = seed_skf)
-target = my_df.loc[:,'resistance']
+lst_accu_stratified = []
+scaler = preprocessing.MinMaxScaler()
+X_scaled = scaler.fit_transform(X)
+#X_scaled = X_scaled[:,[1,2,3]]
 
-df = my_df[['ligand_distance'
-            , 'ligand_affinity_change'
-            , 'duet_stability_change'
-            , 'ddg_foldx'
-            , 'deepddg'
-            , 'ddg_dynamut2'
-            , 'consurf_score'
-            , 'resistance']]
-# To start with we’ll just split our data and print the class ratio for
-# each fold to check that they are all close to the full data set.
-# Test set contains a single fold so we use the test split to determine the
-# class ratio for each fold. You can see that each fold’s class ratio is close
-# to the full data set which is obviously what we want
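+# Sanity check (a minimal sketch of the fold-ratio check described in the
+# deleted comment above): each fold's class ratio should stay close to the
+# full-data ratio. Y is a 0/1 Series, so a test fold's mean is its class ratio.
+for fold_no, (train_index, test_index) in enumerate(skf.split(X, Y), start = 1):
+    print('Fold', fold_no, 'Class Ratio:', Y.iloc[test_index].mean())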
 
+#lr = linear_model.LogisticRegression(class_weight = 'balanced')
+lr = linear_model.LogisticRegression()
 
-fold_no = 1 # to label the folds for printing output
-for train_index, test_index in skf.split(df, target):
-    train = df.loc[train_index,:]
-    test = df.loc[test_index,:]
-    print('Fold',str(fold_no)
-          , 'Class Ratio:'
-          , sum(test['resistance'])/len(test['resistance']))
-    fold_no += 1
+# NOTE: skf.split() yields positional indices, hence .iloc on the Series
+for train_index, test_index in skf.split(X, Y):
+    #print(train_index)
+    #print(test_index)
+    x_train_fold, x_test_fold = X_scaled[train_index], X_scaled[test_index]
+    y_train_fold, y_test_fold = Y.iloc[train_index], Y.iloc[test_index]
+    lr.fit(x_train_fold, y_train_fold)
+    lst_accu_stratified.append(lr.score(x_test_fold, y_test_fold))
 
+# print output
+print('List of fold accuracies:', lst_accu_stratified)
+print('Max accuracy:', max(lst_accu_stratified)*100, "%")
+print('Min accuracy:', min(lst_accu_stratified)*100, "%")
+print('Mean accuracy:', mean(lst_accu_stratified)*100, "%")
+print('St Dev:', stdev(lst_accu_stratified)*100, "%")
+
+#%%
+#--------------------------------------
+# Model 2.1: the same model, but built as a pipeline.
+# Results differ slightly depending on whether the scaler is fit per fold
+# (pipeline) or once on all of X (as above)!
+#--------------------------------------
 model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
-                     , ('logis', LogisticRegression(class_weight = 'balanced'))
-                     ])
-model = LogisticRegression()
-# Next we’ll build a custom function that we can pass our data splits to for
-# training and testing.
-def train_model(train, test, fold_no):
-    X = my_df[['ligand_distance'
-               , 'ligand_affinity_change'
-               , 'duet_stability_change'
-               , 'ddg_foldx'
-               , 'deepddg'
-               , 'ddg_dynamut2'
-               , 'consurf_score']]
-    y = my_df.loc[:,'resistance']
-    X_train = train[X]
-    y_train = train[y]
-    X_test = test[X]
-    y_test = test[y]
-    model.fit(X_train,y_train)
-    predictions = model.predict(X_test)
-    print('Fold',str(fold_no),
-          'Accuracy:',
-          accuracy_score(y_test,predictions))
+                     , ('logis', LogisticRegression()) ]) # default class_weight (None); 'balanced' changes stdev
+seed_skf = 50
+skf = StratifiedKFold(n_splits = 10
+                      , shuffle = True
+                      , random_state = seed_skf)
 
-# Finally, let’s modify the for loop we created above to call the build_model
-# function on each of our splits.
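+# StratifiedKFold yields positional indices, so the features are indexed as a
+# NumPy array below. A one-call equivalent of the loop (a sketch;
+# cross_val_score defaults to the classifier's .score, i.e. accuracy):
+#from sklearn.model_selection import cross_val_score
+#pipe_accs = cross_val_score(model_logisP, np.array(X), Y, cv = skf)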
+
+X_array = np.array(X)
+lst_accu_stratified = []
+for train_index, test_index in skf.split(X_array, Y):
+    x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
+    y_train_fold, y_test_fold = Y.iloc[train_index], Y.iloc[test_index]
+    model_logisP.fit(x_train_fold, y_train_fold)
+    lst_accu_stratified.append(model_logisP.score(x_test_fold, y_test_fold))
 
-fold_no = 1
-for train_index, test_index in skf.split(df, target):
-    train = df.loc[train_index,:]
-    test = df.loc[test_index,:]
-    train_model(train,test,fold_no)
-    fold_no += 1
-    
\ No newline at end of file
+# print output
+print('List of fold accuracies:', lst_accu_stratified)
+print('Max accuracy:', max(lst_accu_stratified)*100, "%")
+print('Min accuracy:', min(lst_accu_stratified)*100, "%")
+print('Mean accuracy:', mean(lst_accu_stratified)*100, "%")
+print('St Dev:', stdev(lst_accu_stratified)*100, "%")
+
+####################################
+# Model 3: stratified K-fold
+# Random forest
+# MinMaxScaler
+# X needs to be a NumPy array for stratified K-fold indexing
+####################################
+
+model_rf = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
+                     , ('rf', RandomForestClassifier(n_estimators = 100, random_state = 42))])
+seed_skf = 50
+skf = StratifiedKFold(n_splits = 10
+                      , shuffle = True
+                      , random_state = seed_skf)
+
+X_array = np.array(X)
+lst_accu_stratified_rf = []
+for train_index, test_index in skf.split(X_array, Y):
+    x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
+    y_train_fold, y_test_fold = Y.iloc[train_index], Y.iloc[test_index]
+    model_rf.fit(x_train_fold, y_train_fold)
+    lst_accu_stratified_rf.append(model_rf.score(x_test_fold, y_test_fold))
+
+# print output
+print('List of fold accuracies:', lst_accu_stratified_rf)
+print('Max accuracy:', max(lst_accu_stratified_rf)*100, "%")
+print('Min accuracy:', min(lst_accu_stratified_rf)*100, "%")
+print('Mean accuracy:', mean(lst_accu_stratified_rf)*100, "%")
+print('St Dev:', stdev(lst_accu_stratified_rf)*100, "%")
+
+####################################
+# Model 4: cross_validate with K-fold
+# Random forest
+# MinMaxScaler
+# X needs to be an array for K-fold
+# FIXME: MSE is a regression metric, not one for logistic regression or
+# random forest classification; on 0/1 predictions it only reproduces the
+# misclassification rate
+####################################
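+# A clearer alternative (a sketch): score with a classification metric instead
+# of the MSE scorer used below; on 0/1 labels, MSE is just 1 - accuracy:
+#from sklearn.metrics import accuracy_score, make_scorer
+#score_fn = make_scorer(accuracy_score)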
+from sklearn.metrics import mean_squared_error, make_scorer
+from sklearn.model_selection import cross_validate
+
+score_fn = make_scorer(mean_squared_error)
+scores = cross_validate(model_rf, X_train, y_train
+                        , scoring = score_fn
+                        , cv = 10)
+
+from itertools import combinations
+def train(X):
+    return cross_validate(model_rf, X, y_train
+                          , scoring = score_fn
+                          , cv = 10
+                          , return_estimator = True)['test_score']
+# NB: combinations() silently yields nothing if the subset size requested
+# here (11) exceeds the number of columns in X_train
+scores = [train(X_train.loc[:, vars]) for vars in combinations(X_train.columns, 11)]
+means = [score.mean() for score in scores]
+#%%
+# https://stackoverflow.com/questions/52316237/finding-logistic-regression-weights-from-k-fold-cv
+from sklearn.linear_model import LogisticRegressionCV
+from sklearn.model_selection import KFold
+kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
+logistic = LogisticRegressionCV(Cs = 2, fit_intercept = True, cv = kf, verbose = 1, random_state = 42)
+logistic.fit(X_train, y_train)
+print("Train Coefficient:", logistic.coef_)    # weights of each feature
+print("Train Intercept:", logistic.intercept_) # value of the intercept
+#%%
+# https://machinelearningmastery.com/repeated-k-fold-cross-validation-with-python/
+from sklearn.model_selection import cross_val_score
+from numpy import std
+cv = KFold(n_splits = 10, random_state = 1, shuffle = True)
+scores = cross_val_score(model_rf, X, Y, scoring = 'accuracy', cv = cv, n_jobs = -1)
+scores2 = cross_val_score(model_logisP, X, Y, scoring = 'accuracy', cv = cv, n_jobs = -1)
+# report performance
+print('RF accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
+print('LR accuracy: %.3f (%.3f)' % (mean(scores2), std(scores2)))
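+#%%
+# Per-fold weights for the pipeline model (a sketch using cross_validate's
+# return_estimator; unlike LogisticRegressionCV above, this gives one fitted
+# model per fold rather than a single refit model):
+out = cross_validate(model_logisP, X, Y, cv = kf, return_estimator = True)
+for fold_no, est in enumerate(out['estimator'], start = 1):
+    print('Fold', fold_no, 'coefficients:', est.named_steps['logis'].coef_)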