Added further split_type options to split_tts: 'none_only' (no split, no blind test set) and 'none_with_bts' (no split, with a blind test set)

2022-07-11 19:27:14 +01:00 · 2022-07-11 19:27:14 +01:00 · 1965517681
commit 1965517681
parent ce730fbe57
2 changed files with 75 additions and 19 deletions
--- a/scripts/ml/ml_functions/SplitTTS.py
+++ b/scripts/ml/ml_functions/SplitTTS.py
@ -44,9 +44,10 @@ homedir = os.path.expanduser("~")
 njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores

 #%% Define split_tts function #################################################
+# NOTE: split_type 'none_with_bts' and 'none_only' always work on the complete data, irrespective of data_type
 def split_tts(ml_input_data
              , data_type      = ['actual', 'complete']
-              , split_type     = ['70_30', '80_20', 'sl']
+              , split_type     = ['70_30', '80_20', 'sl', 'none_with_bts', 'none_only']
              , oversampling   = True
              , dst_colname    = 'dst'# determine how to subset the actual vs reverse data
              , target_colname = 'dst_mode'
@ -114,22 +115,62 @@ def split_tts(ml_input_data
        tts_test_size = 0.2
    if split_type == 'sl':
        tts_test_size = 1/np.sqrt(x_ncols)
-        train_sl = 1 - tts_test_size
+        train_sl = 1 - tts_test_size # for reference
+        
+    if split_type == 'none_with_bts': # always on complete data
+        temp_df_train = ml_input_data[ml_input_data[dst_colname].notna()]
+        X = temp_df_train.drop(cols_to_dropL, axis = 1)
+        y = temp_df_train[target_colname]
+        
+        temp_df_bts = ml_input_data[ml_input_data[dst_colname].isna()]
+        X_bts       = temp_df_bts.drop(cols_to_dropL, axis = 1)
+        y_bts       = temp_df_bts[target_colname]
+        
+        n_test_data_size = len(X) + len(X_bts)
+        test_data_shape = X_bts.shape
+        
+    if split_type == 'none_only':
+        temp_df_train = ml_input_data.copy() # always complete
+        
+        X = temp_df_train.drop(cols_to_dropL, axis = 1)
+        y = temp_df_train[target_colname]
    
    #-------------------------
    #  TTS split ~ split_type
    #-------------------------
    #x_train, x_test, y_train, y_test # traditional var_names
-    # so my downstream code doesn't need to change    
-    X, X_bts, y, y_bts = train_test_split(x_features, y_target
+    # so my downstream code doesn't need to change
+    if split_type in ['70_30', '80_20', 'sl']:
+        X, X_bts, y, y_bts = train_test_split(x_features, y_target
                                                    , test_size = tts_test_size
                                                    , **rs
                                                    , stratify = y_target)
+        n_test_data_size = len(X) + len(X_bts)
+        test_data_shape = X_bts.shape
+        
+        
    yc1 = Counter(y)
    yc1_ratio = yc1[0]/yc1[1]
    
-    yc2 = Counter(y_bts)
-    yc2_ratio = yc2[0]/yc2[1]
+    if split_type in ['none_only']:
+        outDict.update({'X'       : X
+                , 'y'             : y
+                })
+        yc2              = "NO Blind test data"
+        yc2_ratio        = "NO Blind test data"
+        n_test_data_size = "NO Blind test data"
+        test_data_shape  = "NO Blind test data"
+        
+    else:
+        outDict.update({'X'       : X
+                , 'X_bts'         : X_bts
+                , 'y'             : y
+                , 'y_bts'         : y_bts
+                })
+        
+        yc2 = Counter(y_bts)
+        yc2_ratio = yc2[0]/yc2[1]
+        
    ###############################################################################
    #======================================================
    # Determine categorical and numerical features
@ -150,27 +191,21 @@ def split_tts(ml_input_data
          
          , '\n==========================='
          , '\n Resampling: NONE'
-          , '\nBaseline'
+          , '\n Baseline'
          , '\n==========================='
          
-          , '\n\nTotal data size:', len(X) + len(X_bts)
+          , '\n\nTotal data size:', n_test_data_size
    
          , '\n\nTrain data size:', X.shape
-          , '\ny_train numbers:', yc1
+          , '\ny_train numbers:'  , yc1
    
-          , '\n\nTest data size:', X_bts.shape
-          , '\ny_test_numbers:', yc2
+          , '\n\nTest data size:', test_data_shape
+          , '\ny_test_numbers:'  , yc2
    
-          , '\n\ny_train ratio:',yc1_ratio
-          , '\ny_test ratio:', yc2_ratio
+          , '\n\ny_train ratio:' , yc1_ratio
+          , '\ny_test ratio:'    , yc2_ratio
          , '\n-------------------------------------------------------------')
    
-    outDict.update({'X'       : X
-            , 'X_bts' : X_bts
-            , 'y'     : y
-            , 'y_bts' : y_bts
-            } ) 
-    
    if oversampling:
        #######################################################################
        #                               RESAMPLING
--- a/scripts/ml/ml_functions/test_func_singlegene.py
+++ b/scripts/ml/ml_functions/test_func_singlegene.py
@ -109,3 +109,24 @@ baz_df1.rename(columns = {0:'vdw_scaled', 1: 'ele_scaled'}, inplace = True)
 baz2 = pd.concat([baz, baz_df1], axis = 1)

 a = pd.concat([bar2, baz2], axis = 1)
+
+#%% Test the added split_types, i.e. 'none_with_bts' and 'none_only'
+
+spl_type = 'none_with_bts'
+spl_type = 'none_only'
+
+#data_type  = "actual"
+data_type = "complete"
+
+df2 = split_tts(df
+          , data_type = data_type # the 'none_*' split types use the complete data regardless of this value
+          , split_type = spl_type
+          , oversampling = True
+          , dst_colname = 'dst'
+          , target_colname = 'dst_mode'
+          , include_gene_name = True
+          , random_state = 42 # default
+      )
+
+all(df2['X'].columns.isin(['gene_name'])) # should be False
+