diff --git a/scripts/ml/ml_functions/SplitTTS.py b/scripts/ml/ml_functions/SplitTTS.py index 70e47ea..68f6470 100644 --- a/scripts/ml/ml_functions/SplitTTS.py +++ b/scripts/ml/ml_functions/SplitTTS.py @@ -44,9 +44,10 @@ homedir = os.path.expanduser("~") njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores #%% Define split_tts function ################################################# +# NOTE: split_type 'none_with_bts' and 'none_only': WORK on complete data ONLY irrespective of data_type def split_tts(ml_input_data , data_type = ['actual', 'complete'] - , split_type = ['70_30', '80_20', 'sl'] + , split_type = ['70_30', '80_20', 'sl', 'none_with_bts', 'none_only'] , oversampling = True , dst_colname = 'dst'# determine how to subset the actual vs reverse data , target_colname = 'dst_mode' @@ -114,22 +115,62 @@ def split_tts(ml_input_data tts_test_size = 0.2 if split_type == 'sl': tts_test_size = 1/np.sqrt(x_ncols) - train_sl = 1 - tts_test_size + train_sl = 1 - tts_test_size # for reference + + if split_type == 'none_with_bts': # always on complete data + temp_df_train = ml_input_data[ml_input_data[dst_colname].notna()] + X = temp_df_train.drop(cols_to_dropL, axis = 1) + y = temp_df_train[target_colname] + + temp_df_bts = ml_input_data[ml_input_data[dst_colname].isna()] + X_bts = temp_df_bts.drop(cols_to_dropL, axis = 1) + y_bts = temp_df_bts[target_colname] + + n_test_data_size = len(X) + len(X_bts) + test_data_shape = X_bts.shape + + if split_type == 'none_only': + temp_df_train = ml_input_data.copy() # always complete + + X = temp_df_train.drop(cols_to_dropL, axis = 1) + y = temp_df_train[target_colname] #------------------------- # TTS split ~ split_type #------------------------- #x_train, x_test, y_train, y_test # traditional var_names - # so my downstream code doesn't need to change - X, X_bts, y, y_bts = train_test_split(x_features, y_target + # so my downstream code doesn't need to change + if split_type in ['70_30', '80_20', 'sl']: + X, X_bts, y, y_bts = train_test_split(x_features, y_target , test_size = tts_test_size , **rs , stratify = y_target) + n_test_data_size = len(X) + len(X_bts) + test_data_shape = X_bts.shape + + yc1 = Counter(y) yc1_ratio = yc1[0]/yc1[1] - yc2 = Counter(y_bts) - yc2_ratio = yc2[0]/yc2[1] + if split_type in ['none_only']: + outDict.update({'X' : X + , 'y' : y + }) + yc2 = "NO Blind test data" + yc2_ratio = "NO Blind test data" + n_test_data_size = "NO Blind test data" + test_data_shape = "NO Blind test data" + + else: + outDict.update({'X' : X + , 'X_bts' : X_bts + , 'y' : y + , 'y_bts' : y_bts + }) + + yc2 = Counter(y_bts) + yc2_ratio = yc2[0]/yc2[1] + ############################################################################### #====================================================== # Determine categorical and numerical features @@ -150,27 +191,21 @@ def split_tts(ml_input_data , '\n===========================' , '\n Resampling: NONE' - , '\nBaseline' + , '\n Baseline' , '\n===========================' - , '\n\nTotal data size:', len(X) + len(X_bts) + , '\n\nTotal data size:', n_test_data_size , '\n\nTrain data size:', X.shape - , '\ny_train numbers:', yc1 + , '\ny_train numbers:' , yc1 - , '\n\nTest data size:', X_bts.shape - , '\ny_test_numbers:', yc2 + , '\n\nTest data size:', test_data_shape + , '\ny_test_numbers:' , yc2 - , '\n\ny_train ratio:',yc1_ratio - , '\ny_test ratio:', yc2_ratio + , '\n\ny_train ratio:' , yc1_ratio + , '\ny_test ratio:' , yc2_ratio , '\n-------------------------------------------------------------') - outDict.update({'X' : X - , 'X_bts' : X_bts - , 'y' : y - , 'y_bts' : y_bts - } ) - if oversampling: ####################################################################### # RESAMPLING diff --git a/scripts/ml/ml_functions/test_func_singlegene.py b/scripts/ml/ml_functions/test_func_singlegene.py index 729fafe..b86c60b 100644 --- a/scripts/ml/ml_functions/test_func_singlegene.py +++ b/scripts/ml/ml_functions/test_func_singlegene.py @@ -109,3 +109,24 @@ baz_df1.rename(columns = {0:'vdw_scaled', 1: 'ele_scaled'}, inplace = True) baz2 = pd.concat([baz, baz_df1], axis = 1) a = pd.concat([bar2, baz2], axis = 1) + +#%% test added split_types i.e none_with_bts and none_only + +spl_type = 'none_with_bts' +spl_type = 'none_only' + +#data_type = "actual" +data_type = "complete" + +df2 = split_tts(df + , data_type = data_type # only works with complete despite what you set to + , split_type = spl_type + , oversampling = True + , dst_colname = 'dst' + , target_colname = 'dst_mode' + , include_gene_name = True + , random_state = 42 # default + ) + +all(df2['X'].columns.isin(['gene_name'])) # should be False +