From 1fecbc15c983d0ba100afeca5d422c0dc1e943f2 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Thu, 3 Mar 2022 15:18:34 +0000
Subject: [PATCH] added standard KFold as well

---
 ml_data/.~lock.data_colnames.ods# |   1 -
 my_datap4.py                      | 188 ++++++++++++++++++++----------
 2 files changed, 128 insertions(+), 61 deletions(-)
 delete mode 100644 ml_data/.~lock.data_colnames.ods#

diff --git a/ml_data/.~lock.data_colnames.ods# b/ml_data/.~lock.data_colnames.ods#
deleted file mode 100644
index 17d42db..0000000
--- a/ml_data/.~lock.data_colnames.ods#
+++ /dev/null
@@ -1 +0,0 @@
-Tanushree Tunstall,tanu,panino.tunstall.in,03.03.2022 11:47,file:///home/tanu/.config/libreoffice/4;
\ No newline at end of file

diff --git a/my_datap4.py b/my_datap4.py
index eac2d39..f271d21 100644
--- a/my_datap4.py
+++ b/my_datap4.py
@@ -84,7 +84,13 @@ Y = my_df['resistance']
 #%%
 # GET X
 cols = my_df.columns
-X = my_df[['ligand_distance', 'ligand_affinity_change', 'duet_stability_change', 'ddg_foldx', 'deepddg', 'ddg_dynamut2', 'consurf_score']]
+X = my_df[['ligand_distance'
+           , 'ligand_affinity_change'
+           , 'duet_stability_change', 'ddg_foldx', 'deepddg', 'ddg_dynamut2', 'consurf_score'
+           , 'snap2_score'
+           #, 'snap2_accuracy_pc'
+           , 'asa'
+           , 'rsa']]
 
 #%%
 ####################################
@@ -215,79 +221,141 @@ print(f" There are {len(rf_classifier.feature_importances_)} features in total")
 #%%
 ####################################
 # Model 2: case of stratified K-fold
-# Random forest
+# Logistic regression
 # MinMaxScaler
-# https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2
+# https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2 [didn't work!]
+# https://www.geeksforgeeks.org/stratified-k-fold-cross-validation/
 ####################################
 print('Class Ratio:', sum(Y)/len(Y))
 print('Class Ratio:',
-      sum(my_df['resistance'])/len(my_df['resistance'])
-      )
+      sum(my_df['resistance'])/len(my_df['resistance']))
 seed_skf = 50
-skf = StratifiedKFold(n_splits = 10 
+skf = StratifiedKFold(n_splits = 10
                       , shuffle = True
                       , random_state = seed_skf)
-target = my_df.loc[:,'resistance']
+lst_accu_stratified = []
+scaler = preprocessing.MinMaxScaler()
+X_scaled = scaler.fit_transform(X)
+#X_scaled = X_scaled[:,[1,2,3]]
 
-df = my_df[['ligand_distance'
-            , 'ligand_affinity_change'
-            , 'duet_stability_change'
-            , 'ddg_foldx'
-            , 'deepddg'
-            , 'ddg_dynamut2'
-            , 'consurf_score'
-            , 'resistance']]
-# To start with we’ll just split our data and print the class ratio for
-# each fold to check that they are all close to the full data set.
-# Test set contains a single fold so we use the test split to determine the
-# class ratio for each fold. You can see that each fold’s class ratio is close
-# to the full data set which is obviously what we want
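+# Sanity check (a minimal sketch of the fold-ratio check described in the
+# deleted comment above): each fold's class ratio should stay close to the
+# full-data ratio. Y is a 0/1 Series, so a test fold's mean is its class ratio.
+for fold_no, (train_index, test_index) in enumerate(skf.split(X, Y), start = 1):
+    print('Fold', fold_no, 'Class Ratio:', Y.iloc[test_index].mean())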
 
+#lr = linear_model.LogisticRegression(class_weight = 'balanced')
+lr = linear_model.LogisticRegression()
 
-fold_no = 1 # to label the folds for printing output
-for train_index, test_index in skf.split(df, target):
-    train = df.loc[train_index,:]
-    test = df.loc[test_index,:]
-    print('Fold',str(fold_no)
-          , 'Class Ratio:'
-          , sum(test['resistance'])/len(test['resistance']))
-    fold_no += 1
+# NOTE: skf.split() yields positional indices, hence .iloc on the Series
+for train_index, test_index in skf.split(X, Y):
+    #print(train_index)
+    #print(test_index)
+    x_train_fold, x_test_fold = X_scaled[train_index], X_scaled[test_index]
+    y_train_fold, y_test_fold = Y.iloc[train_index], Y.iloc[test_index]
+    lr.fit(x_train_fold, y_train_fold)
+    lst_accu_stratified.append(lr.score(x_test_fold, y_test_fold))
 
+# print output
+print('List of fold accuracies:', lst_accu_stratified)
+print('Max accuracy:', max(lst_accu_stratified)*100, "%")
+print('Min accuracy:', min(lst_accu_stratified)*100, "%")
+print('Mean accuracy:', mean(lst_accu_stratified)*100, "%")
+print('St Dev:', stdev(lst_accu_stratified)*100, "%")
+
+#%%
+#--------------------------------------
+# Model 2.1: the same model, but built as a pipeline.
+# Results differ slightly depending on whether the scaler is fit per fold
+# (pipeline) or once on all of X (as above)!
+#--------------------------------------
 model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
-                     , ('logis', LogisticRegression(class_weight = 'balanced'))
-                     ])
-model = LogisticRegression()
-# Next we’ll build a custom function that we can pass our data splits to for
-# training and testing.
-def train_model(train, test, fold_no):
-    X = my_df[['ligand_distance'
-               , 'ligand_affinity_change'
-               , 'duet_stability_change'
-               , 'ddg_foldx'
-               , 'deepddg'
-               , 'ddg_dynamut2'
-               , 'consurf_score']]
-    y = my_df.loc[:,'resistance']
-    X_train = train[X]
-    y_train = train[y]
-    X_test = test[X]
-    y_test = test[y]
-    model.fit(X_train,y_train)
-    predictions = model.predict(X_test)
-    print('Fold',str(fold_no),
-          'Accuracy:',
-          accuracy_score(y_test,predictions))
+                     , ('logis', LogisticRegression()) ]) # default class_weight (None); 'balanced' changes stdev
+seed_skf = 50
+skf = StratifiedKFold(n_splits = 10
+                      , shuffle = True
+                      , random_state = seed_skf)
 
-# Finally, let’s modify the for loop we created above to call the build_model
-# function on each of our splits.
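+# StratifiedKFold yields positional indices, so the features are indexed as a
+# NumPy array below. A one-call equivalent of the loop (a sketch;
+# cross_val_score defaults to the classifier's .score, i.e. accuracy):
+#from sklearn.model_selection import cross_val_score
+#pipe_accs = cross_val_score(model_logisP, np.array(X), Y, cv = skf)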
+
+X_array = np.array(X)
+lst_accu_stratified = []
+for train_index, test_index in skf.split(X_array, Y):
+    x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
+    y_train_fold, y_test_fold = Y.iloc[train_index], Y.iloc[test_index]
+    model_logisP.fit(x_train_fold, y_train_fold)
+    lst_accu_stratified.append(model_logisP.score(x_test_fold, y_test_fold))
 
-fold_no = 1
-for train_index, test_index in skf.split(df, target):
-    train = df.loc[train_index,:]
-    test = df.loc[test_index,:]
-    train_model(train,test,fold_no)
-    fold_no += 1
-    
\ No newline at end of file
+# print output
+print('List of fold accuracies:', lst_accu_stratified)
+print('Max accuracy:', max(lst_accu_stratified)*100, "%")
+print('Min accuracy:', min(lst_accu_stratified)*100, "%")
+print('Mean accuracy:', mean(lst_accu_stratified)*100, "%")
+print('St Dev:', stdev(lst_accu_stratified)*100, "%")
+
+####################################
+# Model 3: stratified K-fold
+# Random forest
+# MinMaxScaler
+# X needs to be a NumPy array for stratified K-fold indexing
+####################################
+
+model_rf = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
+                     , ('rf', RandomForestClassifier(n_estimators = 100, random_state = 42))])
+seed_skf = 50
+skf = StratifiedKFold(n_splits = 10
+                      , shuffle = True
+                      , random_state = seed_skf)
+
+X_array = np.array(X)
+lst_accu_stratified_rf = []
+for train_index, test_index in skf.split(X_array, Y):
+    x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
+    y_train_fold, y_test_fold = Y.iloc[train_index], Y.iloc[test_index]
+    model_rf.fit(x_train_fold, y_train_fold)
+    lst_accu_stratified_rf.append(model_rf.score(x_test_fold, y_test_fold))
+
+# print output
+print('List of fold accuracies:', lst_accu_stratified_rf)
+print('Max accuracy:', max(lst_accu_stratified_rf)*100, "%")
+print('Min accuracy:', min(lst_accu_stratified_rf)*100, "%")
+print('Mean accuracy:', mean(lst_accu_stratified_rf)*100, "%")
+print('St Dev:', stdev(lst_accu_stratified_rf)*100, "%")
+
+####################################
+# Model 4: cross_validate with K-fold
+# Random forest
+# MinMaxScaler
+# X needs to be an array for K-fold
+# FIXME: MSE is a regression metric, not one for logistic regression or
+# random forest classification; on 0/1 predictions it only reproduces the
+# misclassification rate
+####################################
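+# A clearer alternative (a sketch): score with a classification metric instead
+# of the MSE scorer used below; on 0/1 labels, MSE is just 1 - accuracy:
+#from sklearn.metrics import accuracy_score, make_scorer
+#score_fn = make_scorer(accuracy_score)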
+from sklearn.metrics import mean_squared_error, make_scorer
+from sklearn.model_selection import cross_validate
+
+score_fn = make_scorer(mean_squared_error)
+scores = cross_validate(model_rf, X_train, y_train
+                        , scoring = score_fn
+                        , cv = 10)
+
+from itertools import combinations
+def train(X):
+    return cross_validate(model_rf, X, y_train
+                          , scoring = score_fn
+                          , cv = 10
+                          , return_estimator = True)['test_score']
+# NB: combinations() silently yields nothing if the subset size requested
+# here (11) exceeds the number of columns in X_train
+scores = [train(X_train.loc[:, vars]) for vars in combinations(X_train.columns, 11)]
+means = [score.mean() for score in scores]
+#%%
+# https://stackoverflow.com/questions/52316237/finding-logistic-regression-weights-from-k-fold-cv
+from sklearn.linear_model import LogisticRegressionCV
+from sklearn.model_selection import KFold
+kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
+logistic = LogisticRegressionCV(Cs = 2, fit_intercept = True, cv = kf, verbose = 1, random_state = 42)
+logistic.fit(X_train, y_train)
+print("Train Coefficient:", logistic.coef_)    # weights of each feature
+print("Train Intercept:", logistic.intercept_) # value of the intercept
+#%%
+# https://machinelearningmastery.com/repeated-k-fold-cross-validation-with-python/
+from sklearn.model_selection import cross_val_score
+from numpy import std
+cv = KFold(n_splits = 10, random_state = 1, shuffle = True)
+scores = cross_val_score(model_rf, X, Y, scoring = 'accuracy', cv = cv, n_jobs = -1)
+scores2 = cross_val_score(model_logisP, X, Y, scoring = 'accuracy', cv = cv, n_jobs = -1)
+# report performance
+print('RF accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
+print('LR accuracy: %.3f (%.3f)' % (mean(scores2), std(scores2)))
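+#%%
+# Per-fold weights for the pipeline model (a sketch using cross_validate's
+# return_estimator; unlike LogisticRegressionCV above, this gives one fitted
+# model per fold rather than a single refit model):
+out = cross_validate(model_logisP, X, Y, cv = kf, return_estimator = True)
+for fold_no, est in enumerate(out['estimator'], start = 1):
+    print('Fold', fold_no, 'coefficients:', est.named_steps['logis'].coef_)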