checking the splitTTS script to make sure other splits have been factored in
This commit is contained in:
parent
f4cab1fdfb
commit
63c8876764
2 changed files with 21 additions and 21 deletions
|
@ -47,7 +47,7 @@ njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number
|
|||
# NOTE: split_type 'none_with_bts' and 'none_only': WORK on complete data ONLY irrespective of data_type
|
||||
def split_tts(ml_input_data
|
||||
, data_type = ['actual', 'complete']
|
||||
, split_type = ['70_30', '80_20', 'sl', 'none_with_bts', 'none_only', 'reverse']
|
||||
, split_type = ['70_30', '80_20', 'sl', 'rt', 'none_bts', 'none']
|
||||
, oversampling = True
|
||||
, dst_colname = 'dst'# determine how to subset the actual vs reverse data
|
||||
, target_colname = 'dst_mode'
|
||||
|
@ -116,19 +116,7 @@ def split_tts(ml_input_data
|
|||
if split_type == 'sl':
|
||||
tts_test_size = 1/np.sqrt(x_ncols)
|
||||
train_sl = 1 - tts_test_size # for reference
|
||||
|
||||
if split_type == 'none_with_bts': # always on complete data
|
||||
temp_df_train = ml_input_data[ml_input_data[dst_colname].notna()]
|
||||
X = temp_df_train.drop(cols_to_dropL, axis = 1)
|
||||
y = temp_df_train[target_colname]
|
||||
|
||||
temp_df_bts = ml_input_data[ml_input_data[dst_colname].isna()]
|
||||
X_bts = temp_df_bts.drop(cols_to_dropL, axis = 1)
|
||||
y_bts = temp_df_bts[target_colname]
|
||||
|
||||
n_test_data_size = len(X) + len(X_bts)
|
||||
test_data_shape = X_bts.shape
|
||||
|
||||
|
||||
if split_type == 'rt': # always on complete data
|
||||
temp_df_train = ml_input_data[ml_input_data[dst_colname].isna()]
|
||||
X = temp_df_train.drop(cols_to_dropL, axis = 1)
|
||||
|
@ -138,10 +126,22 @@ def split_tts(ml_input_data
|
|||
X_bts = temp_df_bts.drop(cols_to_dropL, axis = 1)
|
||||
y_bts = temp_df_bts[target_colname]
|
||||
|
||||
n_test_data_size = len(X) + len(X_bts)
|
||||
test_data_shape = X_bts.shape
|
||||
|
||||
if split_type == 'none_bts': # always on complete data
|
||||
temp_df_train = ml_input_data[ml_input_data[dst_colname].notna()]
|
||||
X = temp_df_train.drop(cols_to_dropL, axis = 1)
|
||||
y = temp_df_train[target_colname]
|
||||
|
||||
temp_df_bts = ml_input_data[ml_input_data[dst_colname].isna()]
|
||||
X_bts = temp_df_bts.drop(cols_to_dropL, axis = 1)
|
||||
y_bts = temp_df_bts[target_colname]
|
||||
|
||||
n_test_data_size = len(X) + len(X_bts)
|
||||
test_data_shape = X_bts.shape
|
||||
|
||||
if split_type == 'none_only':
|
||||
if split_type == 'none': # always on complete data
|
||||
temp_df_train = ml_input_data.copy() # always complete
|
||||
X = temp_df_train.drop(cols_to_dropL, axis = 1)
|
||||
y = temp_df_train[target_colname]
|
||||
|
@ -163,10 +163,10 @@ def split_tts(ml_input_data
|
|||
yc1 = Counter(y)
|
||||
yc1_ratio = yc1[0]/yc1[1]
|
||||
|
||||
if split_type in ['none_only']:
|
||||
if split_type in ['none']:
|
||||
outDict.update({'X' : X
|
||||
, 'y' : y
|
||||
})
|
||||
, 'y' : y })
|
||||
|
||||
yc2 = "NO Blind test data"
|
||||
yc2_ratio = "NO Blind test data"
|
||||
n_test_data_size = "NO Blind test data"
|
||||
|
|
|
@ -41,9 +41,9 @@ gene_model_paramD = {'data_combined_model' : False
|
|||
###############################################################################
|
||||
#ml_genes = ["pncA", "embB", "katG", "rpoB", "gid"]
|
||||
|
||||
ml_gene_drugD = {#'pncA' : 'pyrazinamide'
|
||||
# 'embB' : 'ethambutol'
|
||||
'katG' : 'isoniazid'
|
||||
ml_gene_drugD = {'pncA' : 'pyrazinamide'
|
||||
, 'embB' : 'ethambutol'
|
||||
, 'katG' : 'isoniazid'
|
||||
, 'rpoB' : 'rifampicin'
|
||||
, 'gid' : 'streptomycin'
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue