slight formatting for existing scripts

2022-06-18 19:35:49 +01:00 · 2022-06-18 19:35:49 +01:00 · 9bc26c1947
commit 9bc26c1947
parent a53fce5455
4 changed files with 33 additions and 7 deletions
--- a/scripts/ml/ml_data_7030.py
+++ b/scripts/ml/ml_data_7030.py
@@ -667,6 +667,7 @@ def setvars(gene,drug):

    print('\n-------------------------------------------------------------'
          , '\nSuccessfully split data with stratification: 70/30'
+          , '\nInput features data size:', x_features.shape
          , '\nTrain data size:', X.shape
          , '\nTest data size:', X_bts.shape
          , '\ny_train numbers:', yc1
--- a/scripts/ml/ml_data_8020.py
+++ b/scripts/ml/ml_data_8020.py
@@ -655,6 +655,7 @@ def setvars(gene,drug):

    print('\n-------------------------------------------------------------'
          , '\nSuccessfully split data with stratification: 80/20 '
+          , '\nInput features data size:', x_features.shape
          , '\nTrain data size:', X.shape
          , '\nTest data size:', X_bts.shape
          , '\ny_train numbers:', yc1
--- a/scripts/ml/ml_data_sl.py
+++ b/scripts/ml/ml_data_sl.py
@@ -553,9 +553,7 @@ def setvars(gene,drug):
    # Training and BLIND test set: scaling law split
    # https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d
    
-    # Throw away previous blind_test_df, and call the 30% data as blind_test
-    # as these were imputed values and initial analysis shows that this
-    # is not very representative
+    # test data size ~ 1/sqrt(features NOT including target variable)
    #================================================================
    my_df_ml[drug].isna().sum()
    #    blind_test_df = my_df_ml[my_df_ml[drug].isna()]
@@ -650,7 +648,6 @@ def setvars(gene,drug):
        x_ncols = len(x_features.columns)
        print('\nNo. of columns for x_features:', x_ncols)
        # NEED It for scaling law split
-        #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d
    else:
        sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')
        
@@ -670,7 +667,8 @@ def setvars(gene,drug):
    yc2_ratio = yc2[0]/yc2[1]

    print('\n-------------------------------------------------------------'
-          , '\nSuccessfully split data according to scaling law: 1/np.sqrt(x_ncols)'
+          , '\nSuccessfully split data with stratification according to scaling law: 1/sqrt(x_ncols)'
+          , '\nInput features data size:', x_features.shape          
          , '\nTrain data size:', X.shape
          , '\nTest data size:', sl_test_size, '    ', X_bts.shape
          , '\ny_train numbers:', yc1
--- a/scripts/ml/running_ml_scripts.txt
+++ b/scripts/ml/running_ml_scripts.txt
@@ -90,8 +90,34 @@
 ./rpob_rt.py 2>&1 | tee log_rpob_rt.txt
 ./alr_rt.py 2>&1 | tee log_alr_rt.txt

+########################################################################
+# COMPLETE Data: actual + NA (i.e. imputed)
+########################################################################

+=================================
+# Split: 70/30 [COMPLETE DATA]
+# All features including AA index
+# Date: 18/05/2022
+# captures errors too: 2>&1 (stderr redirected to stdout, so tee logs both)
+=================================
+./pnca_cd_7030.py 2>&1 | tee log_pnca_cd_7030.txt
+./embb_cd_7030.py 2>&1 | tee log_embb_cd_7030.txt
+./gid_cd_7030.py 2>&1 | tee log_gid_cd_7030.txt
+./katg_cd_7030.py 2>&1 | tee log_katg_cd_7030.txt
+./rpob_cd_7030.py 2>&1 | tee log_rpob_cd_7030.txt
+./alr_cd_7030.py 2>&1 | tee log_alr_cd_7030.txt

-
-
+########################################################################
+=================================
+# Split: 80/20 [COMPLETE DATA]
+# All features including AA index
+# Date: 18/05/2022
+# captures errors too: 2>&1 (stderr redirected to stdout, so tee logs both)
+=================================
+./pnca_cd_8020.py 2>&1 | tee log_pnca_cd_8020.txt
+./embb_cd_8020.py 2>&1 | tee log_embb_cd_8020.txt
+./gid_cd_8020.py 2>&1 | tee log_gid_cd_8020.txt
+./katg_cd_8020.py 2>&1 | tee log_katg_cd_8020.txt
+./rpob_cd_8020.py 2>&1 | tee log_rpob_cd_8020.txt
+./alr_cd_8020.py 2>&1 | tee log_alr_cd_8020.txt