From 9bc26c19471bf4d1191ef971990c1568f31e643e Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Sat, 18 Jun 2022 19:35:49 +0100
Subject: [PATCH] slight formatting for existing scripts

---
 scripts/ml/ml_data_7030.py        |  1 +
 scripts/ml/ml_data_8020.py        |  1 +
 scripts/ml/ml_data_sl.py          |  8 +++-----
 scripts/ml/running_ml_scripts.txt | 30 ++++++++++++++++++++++++++++--
 4 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/scripts/ml/ml_data_7030.py b/scripts/ml/ml_data_7030.py
index 4976439..83a792a 100644
--- a/scripts/ml/ml_data_7030.py
+++ b/scripts/ml/ml_data_7030.py
@@ -667,6 +667,7 @@ def setvars(gene,drug):
 
     print('\n-------------------------------------------------------------'
           , '\nSuccessfully split data with stratification: 70/30'
+          , '\nInput features data size:', x_features.shape
           , '\nTrain data size:', X.shape
           , '\nTest data size:', X_bts.shape
           , '\ny_train numbers:', yc1
diff --git a/scripts/ml/ml_data_8020.py b/scripts/ml/ml_data_8020.py
index 9049345..4928839 100644
--- a/scripts/ml/ml_data_8020.py
+++ b/scripts/ml/ml_data_8020.py
@@ -655,6 +655,7 @@ def setvars(gene,drug):
 
     print('\n-------------------------------------------------------------'
           , '\nSuccessfully split data with stratification: 80/20 '
+          , '\nInput features data size:', x_features.shape
           , '\nTrain data size:', X.shape
           , '\nTest data size:', X_bts.shape
           , '\ny_train numbers:', yc1
diff --git a/scripts/ml/ml_data_sl.py b/scripts/ml/ml_data_sl.py
index b8a9ded..e850ab5 100644
--- a/scripts/ml/ml_data_sl.py
+++ b/scripts/ml/ml_data_sl.py
@@ -553,9 +553,7 @@ def setvars(gene,drug):
     # Training and BLIND test set: scaling law split
     # https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d
     
-    # Throw away previous blind_test_df, and call the 30% data as blind_test
-    # as these were imputed values and initial analysis shows that this
-    # is not very representative
+    # Scaling law: test set fraction ~ 1/sqrt(no. of features, NOT including the target variable)
     #================================================================
     my_df_ml[drug].isna().sum()
     #    blind_test_df = my_df_ml[my_df_ml[drug].isna()]
@@ -650,7 +648,6 @@ def setvars(gene,drug):
         x_ncols = len(x_features.columns)
         print('\nNo. of columns for x_features:', x_ncols)
         # NEED It for scaling law split
-        #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d
     else:
         sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')
         
@@ -670,7 +667,8 @@ def setvars(gene,drug):
     yc2_ratio = yc2[0]/yc2[1]
 
     print('\n-------------------------------------------------------------'
-          , '\nSuccessfully split data according to scaling law: 1/np.sqrt(x_ncols)'
+          , '\nSuccessfully split data with stratification according to scaling law: 1/sqrt(x_ncols)'
+          , '\nInput features data size:', x_features.shape          
           , '\nTrain data size:', X.shape
           , '\nTest data size:', sl_test_size, '    ', X_bts.shape
           , '\ny_train numbers:', yc1
diff --git a/scripts/ml/running_ml_scripts.txt b/scripts/ml/running_ml_scripts.txt
index ae8736f..f9b271a 100644
--- a/scripts/ml/running_ml_scripts.txt
+++ b/scripts/ml/running_ml_scripts.txt
@@ -90,8 +90,34 @@
 ./rpob_rt.py 2>&1 | tee log_rpob_rt.txt
 ./alr_rt.py 2>&1 | tee log_alr_rt.txt
 
+########################################################################
+# COMPLETE Data: actual + NA (i.e. imputed)
+########################################################################
 
+=================================
+# Split: 70/30 [COMPLETE DATA]
+# All features including AA index
+# Date: 18/05/2022
+# captures error: 2>&1
+=================================
+./pnca_cd_7030.py 2>&1 | tee log_pnca_cd_7030.txt
+./embb_cd_7030.py 2>&1 | tee log_embb_cd_7030.txt
+./gid_cd_7030.py 2>&1 | tee log_gid_cd_7030.txt
+./katg_cd_7030.py 2>&1 | tee log_katg_cd_7030.txt
+./rpob_cd_7030.py 2>&1 | tee log_rpob_cd_7030.txt
+./alr_cd_7030.py 2>&1 | tee log_alr_cd_7030.txt
 
-
-
+########################################################################
+=================================
+# Split: 80/20 [COMPLETE DATA]
+# All features including AA index
+# Date: 18/05/2022
+# captures error: 2>&1
+=================================
+./pnca_cd_8020.py 2>&1 | tee log_pnca_cd_8020.txt
+./embb_cd_8020.py 2>&1 | tee log_embb_cd_8020.txt
+./gid_cd_8020.py 2>&1 | tee log_gid_cd_8020.txt
+./katg_cd_8020.py 2>&1 | tee log_katg_cd_8020.txt
+./rpob_cd_8020.py 2>&1 | tee log_rpob_cd_8020.txt
+./alr_cd_8020.py 2>&1 | tee log_alr_cd_8020.txt