Added further split_type options: 'none_with_bts' and 'none_only'
This commit is contained in:
parent
ce730fbe57
commit
1965517681
2 changed files with 75 additions and 19 deletions
|
@ -44,9 +44,10 @@ homedir = os.path.expanduser("~")
|
||||||
njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
|
njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
|
||||||
|
|
||||||
#%% Define split_tts function #################################################
|
#%% Define split_tts function #################################################
|
||||||
|
# NOTE: split_types 'none_with_bts' and 'none_only' ALWAYS work on the complete data, irrespective of data_type
|
||||||
def split_tts(ml_input_data
|
def split_tts(ml_input_data
|
||||||
, data_type = ['actual', 'complete']
|
, data_type = ['actual', 'complete']
|
||||||
, split_type = ['70_30', '80_20', 'sl']
|
, split_type = ['70_30', '80_20', 'sl', 'none_with_bts', 'none_only']
|
||||||
, oversampling = True
|
, oversampling = True
|
||||||
, dst_colname = 'dst'# determine how to subset the actual vs reverse data
|
, dst_colname = 'dst'# determine how to subset the actual vs reverse data
|
||||||
, target_colname = 'dst_mode'
|
, target_colname = 'dst_mode'
|
||||||
|
@ -114,22 +115,62 @@ def split_tts(ml_input_data
|
||||||
tts_test_size = 0.2
|
tts_test_size = 0.2
|
||||||
if split_type == 'sl':
|
if split_type == 'sl':
|
||||||
tts_test_size = 1/np.sqrt(x_ncols)
|
tts_test_size = 1/np.sqrt(x_ncols)
|
||||||
train_sl = 1 - tts_test_size
|
train_sl = 1 - tts_test_size # for reference
|
||||||
|
|
||||||
|
if split_type == 'none_with_bts': # always on complete data
|
||||||
|
temp_df_train = ml_input_data[ml_input_data[dst_colname].notna()]
|
||||||
|
X = temp_df_train.drop(cols_to_dropL, axis = 1)
|
||||||
|
y = temp_df_train[target_colname]
|
||||||
|
|
||||||
|
temp_df_bts = ml_input_data[ml_input_data[dst_colname].isna()]
|
||||||
|
X_bts = temp_df_bts.drop(cols_to_dropL, axis = 1)
|
||||||
|
y_bts = temp_df_bts[target_colname]
|
||||||
|
|
||||||
|
n_test_data_size = len(X) + len(X_bts)
|
||||||
|
test_data_shape = X_bts.shape
|
||||||
|
|
||||||
|
if split_type == 'none_only':
|
||||||
|
temp_df_train = ml_input_data.copy() # always complete
|
||||||
|
|
||||||
|
X = temp_df_train.drop(cols_to_dropL, axis = 1)
|
||||||
|
y = temp_df_train[target_colname]
|
||||||
|
|
||||||
#-------------------------
|
#-------------------------
|
||||||
# TTS split ~ split_type
|
# TTS split ~ split_type
|
||||||
#-------------------------
|
#-------------------------
|
||||||
#x_train, x_test, y_train, y_test # traditional var_names
|
#x_train, x_test, y_train, y_test # traditional var_names
|
||||||
# so my downstream code doesn't need to change
|
# so my downstream code doesn't need to change
|
||||||
|
if split_type in ['70_30', '80_20', 'sl']:
|
||||||
X, X_bts, y, y_bts = train_test_split(x_features, y_target
|
X, X_bts, y, y_bts = train_test_split(x_features, y_target
|
||||||
, test_size = tts_test_size
|
, test_size = tts_test_size
|
||||||
, **rs
|
, **rs
|
||||||
, stratify = y_target)
|
, stratify = y_target)
|
||||||
|
n_test_data_size = len(X) + len(X_bts)
|
||||||
|
test_data_shape = X_bts.shape
|
||||||
|
|
||||||
|
|
||||||
yc1 = Counter(y)
|
yc1 = Counter(y)
|
||||||
yc1_ratio = yc1[0]/yc1[1]
|
yc1_ratio = yc1[0]/yc1[1]
|
||||||
|
|
||||||
|
if split_type in ['none_only']:
|
||||||
|
outDict.update({'X' : X
|
||||||
|
, 'y' : y
|
||||||
|
})
|
||||||
|
yc2 = "NO Blind test data"
|
||||||
|
yc2_ratio = "NO Blind test data"
|
||||||
|
n_test_data_size = "NO Blind test data"
|
||||||
|
test_data_shape = "NO Blind test data"
|
||||||
|
|
||||||
|
else:
|
||||||
|
outDict.update({'X' : X
|
||||||
|
, 'X_bts' : X_bts
|
||||||
|
, 'y' : y
|
||||||
|
, 'y_bts' : y_bts
|
||||||
|
})
|
||||||
|
|
||||||
yc2 = Counter(y_bts)
|
yc2 = Counter(y_bts)
|
||||||
yc2_ratio = yc2[0]/yc2[1]
|
yc2_ratio = yc2[0]/yc2[1]
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
#======================================================
|
#======================================================
|
||||||
# Determine categorical and numerical features
|
# Determine categorical and numerical features
|
||||||
|
@ -153,24 +194,18 @@ def split_tts(ml_input_data
|
||||||
, '\n Baseline'
|
, '\n Baseline'
|
||||||
, '\n==========================='
|
, '\n==========================='
|
||||||
|
|
||||||
, '\n\nTotal data size:', len(X) + len(X_bts)
|
, '\n\nTotal data size:', n_test_data_size
|
||||||
|
|
||||||
, '\n\nTrain data size:', X.shape
|
, '\n\nTrain data size:', X.shape
|
||||||
, '\ny_train numbers:' , yc1
|
, '\ny_train numbers:' , yc1
|
||||||
|
|
||||||
, '\n\nTest data size:', X_bts.shape
|
, '\n\nTest data size:', test_data_shape
|
||||||
, '\ny_test_numbers:' , yc2
|
, '\ny_test_numbers:' , yc2
|
||||||
|
|
||||||
, '\n\ny_train ratio:' , yc1_ratio
|
, '\n\ny_train ratio:' , yc1_ratio
|
||||||
, '\ny_test ratio:' , yc2_ratio
|
, '\ny_test ratio:' , yc2_ratio
|
||||||
, '\n-------------------------------------------------------------')
|
, '\n-------------------------------------------------------------')
|
||||||
|
|
||||||
outDict.update({'X' : X
|
|
||||||
, 'X_bts' : X_bts
|
|
||||||
, 'y' : y
|
|
||||||
, 'y_bts' : y_bts
|
|
||||||
} )
|
|
||||||
|
|
||||||
if oversampling:
|
if oversampling:
|
||||||
#######################################################################
|
#######################################################################
|
||||||
# RESAMPLING
|
# RESAMPLING
|
||||||
|
|
|
@ -109,3 +109,24 @@ baz_df1.rename(columns = {0:'vdw_scaled', 1: 'ele_scaled'}, inplace = True)
|
||||||
baz2 = pd.concat([baz, baz_df1], axis = 1)
|
baz2 = pd.concat([baz, baz_df1], axis = 1)
|
||||||
|
|
||||||
a = pd.concat([bar2, baz2], axis = 1)
|
a = pd.concat([bar2, baz2], axis = 1)
|
||||||
|
|
||||||
|
#%% Test the added split_types, i.e. 'none_with_bts' and 'none_only'
|
||||||
|
|
||||||
|
spl_type = 'none_with_bts'
|
||||||
|
spl_type = 'none_only'
|
||||||
|
|
||||||
|
#data_type = "actual"
|
||||||
|
data_type = "complete"
|
||||||
|
|
||||||
|
df2 = split_tts(df
|
||||||
|
, data_type = data_type # these split_types always use the complete data, whatever is set here
|
||||||
|
, split_type = spl_type
|
||||||
|
, oversampling = True
|
||||||
|
, dst_colname = 'dst'
|
||||||
|
, target_colname = 'dst_mode'
|
||||||
|
, include_gene_name = True
|
||||||
|
, random_state = 42 # default
|
||||||
|
)
|
||||||
|
|
||||||
|
all(df2['X'].columns.isin(['gene_name'])) # should be False
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue