diff --git a/scripts/ml/ml_functions/SplitTTS.py b/scripts/ml/ml_functions/SplitTTS.py index 68f6470..64035ab 100644 --- a/scripts/ml/ml_functions/SplitTTS.py +++ b/scripts/ml/ml_functions/SplitTTS.py @@ -47,7 +47,7 @@ njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number # NOTE: split_type 'none_with_bts' and 'none_only': WORK on complete data ONLY irrespective of data_type def split_tts(ml_input_data , data_type = ['actual', 'complete'] - , split_type = ['70_30', '80_20', 'sl', 'none_with_bts', 'none_only'] + , split_type = ['70_30', '80_20', 'sl', 'none_with_bts', 'none_only', 'rt'] , oversampling = True , dst_colname = 'dst'# determine how to subset the actual vs reverse data , target_colname = 'dst_mode' @@ -128,10 +128,21 @@ def split_tts(ml_input_data n_test_data_size = len(X) + len(X_bts) test_data_shape = X_bts.shape + + if split_type == 'rt': # always on complete data + temp_df_train = ml_input_data[ml_input_data[dst_colname].isna()] + X = temp_df_train.drop(cols_to_dropL, axis = 1) + y = temp_df_train[target_colname] + temp_df_bts = ml_input_data[ml_input_data[dst_colname].notna()] + X_bts = temp_df_bts.drop(cols_to_dropL, axis = 1) + y_bts = temp_df_bts[target_colname] + + n_test_data_size = len(X) + len(X_bts) + test_data_shape = X_bts.shape + if split_type == 'none_only': temp_df_train = ml_input_data.copy() # always complete - X = temp_df_train.drop(cols_to_dropL, axis = 1) y = temp_df_train[target_colname] @@ -194,16 +205,16 @@ def split_tts(ml_input_data , '\n Baseline' , '\n===========================' - , '\n\nTotal data size:', n_test_data_size + , '\ninput data size:' , len(ml_input_data) - , '\n\nTrain data size:', X.shape - , '\ny_train numbers:' , yc1 + , '\n\nTrain data size:' , X.shape + , '\ny_train numbers:' , yc1 - , '\n\nTest data size:', test_data_shape - , '\ny_test_numbers:' , yc2 + , '\n\nTest data size:' , test_data_shape + , '\ny_test_numbers:' , yc2 - , '\n\ny_train ratio:' , yc1_ratio - , 
'\ny_test ratio:' , yc2_ratio + , '\n\ny_train ratio:' , yc1_ratio + , '\ny_test ratio:' , yc2_ratio , '\n-------------------------------------------------------------') if oversampling: diff --git a/scripts/ml/ml_functions/test_func_singlegene.py b/scripts/ml/ml_functions/test_func_singlegene.py index b86c60b..2b1f3ac 100644 --- a/scripts/ml/ml_functions/test_func_singlegene.py +++ b/scripts/ml/ml_functions/test_func_singlegene.py @@ -111,9 +111,9 @@ baz2 = pd.concat([baz, baz_df1], axis = 1) a = pd.concat([bar2, baz2], axis = 1) #%% test added split_types i.e none_with_bts and none_only - -spl_type = 'none_with_bts' spl_type = 'none_only' +spl_type = 'none_with_bts' +spl_type = 'rt' #data_type = "actual" data_type = "complete"