Added other split_type options, i.e. 'none_only' and 'none_with_bts'

This commit is contained in:
Tanushree Tunstall 2022-07-11 19:27:14 +01:00
parent ce730fbe57
commit 1965517681
2 changed files with 75 additions and 19 deletions

View file

@ -44,9 +44,10 @@ homedir = os.path.expanduser("~")
njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
#%% Define split_tts function #################################################
# NOTE: split_type 'none_with_bts' and 'none_only' always operate on the complete data, irrespective of data_type
def split_tts(ml_input_data
, data_type = ['actual', 'complete']
, split_type = ['70_30', '80_20', 'sl']
, split_type = ['70_30', '80_20', 'sl', 'none_with_bts', 'none_only']
, oversampling = True
, dst_colname = 'dst'# determine how to subset the actual vs reverse data
, target_colname = 'dst_mode'
@ -114,22 +115,62 @@ def split_tts(ml_input_data
tts_test_size = 0.2
if split_type == 'sl':
tts_test_size = 1/np.sqrt(x_ncols)
train_sl = 1 - tts_test_size
train_sl = 1 - tts_test_size # for reference
if split_type == 'none_with_bts': # always on complete data
temp_df_train = ml_input_data[ml_input_data[dst_colname].notna()]
X = temp_df_train.drop(cols_to_dropL, axis = 1)
y = temp_df_train[target_colname]
temp_df_bts = ml_input_data[ml_input_data[dst_colname].isna()]
X_bts = temp_df_bts.drop(cols_to_dropL, axis = 1)
y_bts = temp_df_bts[target_colname]
n_test_data_size = len(X) + len(X_bts)
test_data_shape = X_bts.shape
if split_type == 'none_only':
temp_df_train = ml_input_data.copy() # always complete
X = temp_df_train.drop(cols_to_dropL, axis = 1)
y = temp_df_train[target_colname]
#-------------------------
# TTS split ~ split_type
#-------------------------
#x_train, x_test, y_train, y_test # traditional var_names
# so my downstream code doesn't need to change
X, X_bts, y, y_bts = train_test_split(x_features, y_target
# so my downstream code doesn't need to change
if split_type in ['70_30', '80_20', 'sl']:
X, X_bts, y, y_bts = train_test_split(x_features, y_target
, test_size = tts_test_size
, **rs
, stratify = y_target)
n_test_data_size = len(X) + len(X_bts)
test_data_shape = X_bts.shape
yc1 = Counter(y)
yc1_ratio = yc1[0]/yc1[1]
yc2 = Counter(y_bts)
yc2_ratio = yc2[0]/yc2[1]
if split_type in ['none_only']:
outDict.update({'X' : X
, 'y' : y
})
yc2 = "NO Blind test data"
yc2_ratio = "NO Blind test data"
n_test_data_size = "NO Blind test data"
test_data_shape = "NO Blind test data"
else:
outDict.update({'X' : X
, 'X_bts' : X_bts
, 'y' : y
, 'y_bts' : y_bts
})
yc2 = Counter(y_bts)
yc2_ratio = yc2[0]/yc2[1]
###############################################################################
#======================================================
# Determine categorical and numerical features
@ -150,27 +191,21 @@ def split_tts(ml_input_data
, '\n==========================='
, '\n Resampling: NONE'
, '\nBaseline'
, '\n Baseline'
, '\n==========================='
, '\n\nTotal data size:', len(X) + len(X_bts)
, '\n\nTotal data size:', n_test_data_size
, '\n\nTrain data size:', X.shape
, '\ny_train numbers:', yc1
, '\ny_train numbers:' , yc1
, '\n\nTest data size:', X_bts.shape
, '\ny_test_numbers:', yc2
, '\n\nTest data size:', test_data_shape
, '\ny_test_numbers:' , yc2
, '\n\ny_train ratio:',yc1_ratio
, '\ny_test ratio:', yc2_ratio
, '\n\ny_train ratio:' , yc1_ratio
, '\ny_test ratio:' , yc2_ratio
, '\n-------------------------------------------------------------')
outDict.update({'X' : X
, 'X_bts' : X_bts
, 'y' : y
, 'y_bts' : y_bts
} )
if oversampling:
#######################################################################
# RESAMPLING