added reverse training as split type in SplitTTS.py

2022-07-11 20:03:06 +01:00 · 2022-07-11 20:03:06 +01:00 · 6950c4b057
commit 6950c4b057
parent 1965517681
2 changed files with 22 additions and 11 deletions
--- a/scripts/ml/ml_functions/SplitTTS.py
+++ b/scripts/ml/ml_functions/SplitTTS.py
@ -47,7 +47,7 @@ njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number
 # NOTE: split_type 'none_with_bts' and 'none_only': WORK on complete data ONLY irrespective of data_type
 def split_tts(ml_input_data
              , data_type      = ['actual', 'complete']
-              , split_type     = ['70_30', '80_20', 'sl', 'none_with_bts', 'none_only']
+              , split_type     = ['70_30', '80_20', 'sl', 'none_with_bts', 'none_only', 'reverse']
              , oversampling   = True
              , dst_colname    = 'dst'# determine how to subset the actual vs reverse data
              , target_colname = 'dst_mode'
@ -128,10 +128,21 @@ def split_tts(ml_input_data
        
        n_test_data_size = len(X) + len(X_bts)
        test_data_shape = X_bts.shape
+
+    if split_type == 'rt': # always on complete data
+        temp_df_train = ml_input_data[ml_input_data[dst_colname].isna()]
+        X = temp_df_train.drop(cols_to_dropL, axis = 1)
+        y = temp_df_train[target_colname]
        
+        temp_df_bts = ml_input_data[ml_input_data[dst_colname].notna()]
+        X_bts       = temp_df_bts.drop(cols_to_dropL, axis = 1)
+        y_bts       = temp_df_bts[target_colname]
+        
+        n_test_data_size = len(X) + len(X_bts)
+        test_data_shape = X_bts.shape
+                
    if split_type == 'none_only':
        temp_df_train = ml_input_data.copy() # always complete
-        
        X = temp_df_train.drop(cols_to_dropL, axis = 1)
        y = temp_df_train[target_colname]
    
@ -194,16 +205,16 @@ def split_tts(ml_input_data
          , '\n Baseline'
          , '\n==========================='
          
-          , '\n\nTotal data size:', n_test_data_size
+          , '\ninput data size:'   , len(ml_input_data)
    
-          , '\n\nTrain data size:', X.shape
-          , '\ny_train numbers:'  , yc1
+          , '\n\nTrain data size:' , X.shape
+          , '\ny_train numbers:'   , yc1
    
-          , '\n\nTest data size:', test_data_shape
-          , '\ny_test_numbers:'  , yc2
+          , '\n\nTest data size:'  , test_data_shape
+          , '\ny_test_numbers:'    , yc2
    
-          , '\n\ny_train ratio:' , yc1_ratio
-          , '\ny_test ratio:'    , yc2_ratio
+          , '\n\ny_train ratio:'   , yc1_ratio
+          , '\ny_test ratio:'      , yc2_ratio
          , '\n-------------------------------------------------------------')
    
    if oversampling:
--- a/scripts/ml/ml_functions/test_func_singlegene.py
+++ b/scripts/ml/ml_functions/test_func_singlegene.py
@ -111,9 +111,9 @@ baz2 = pd.concat([baz, baz_df1], axis = 1)
 a = pd.concat([bar2, baz2], axis = 1)

 #%% test added split_types i.e none_with_bts and none_only
-
-spl_type = 'none_with_bts'
 spl_type = 'none_only'
+spl_type = 'none_with_bts'
+spl_type = 'rt'

 #data_type  = "actual"
 data_type = "complete"