trying different CV thresholds for single gene

2022-07-28 15:19:13 +01:00 · 2022-07-28 15:19:13 +01:00 · b87f8d0295
commit b87f8d0295
parent 8d8a61675f
2 changed files with 54 additions and 461 deletions
--- a/scripts/ml/ml_functions/test_func_singlegene.py
+++ b/scripts/ml/ml_functions/test_func_singlegene.py
@ -15,8 +15,7 @@ sys.path
 from GetMLData import *
 from SplitTTS import *
 from MultClfs import *
-from MultClfs_noBTS import *
-
+from MultClfs_CVs import *

 #%%
 rs = {'random_state': 42}
@ -27,6 +26,7 @@ skf_cv = StratifiedKFold(n_splits = 10
 #                                   , n_repeats = 3
 #                                   , **rs)
 # param dict for getmldata()
+#%% READ data
 gene_model_paramD = {'data_combined_model'       : False
                    , 'use_or'                   : False
                    , 'omit_all_genomic_features': False
@ -40,7 +40,7 @@ df = getmldata('embB', 'ethambutol'   , **gene_model_paramD)
 #df = getmldata('rpoB', 'rifampicin'   , **gene_model_paramD)
 #df  = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
 #df  = getmldata('alr' , 'cycloserine'  , **gene_model_paramD)
-
+#%% SPLIT, Data and Resampling types
 all(df.columns.isin(['gene_name'])) # should be False
 spl_type = '70_30'
 #spl_type = '80_20'
@ -143,11 +143,13 @@ from sklearn.utils import all_estimators
 all_clfs = all_estimators(type_filter="classifier")
 df = pd.DataFrame (all_clfs, columns = ['classifier_name', 'classifier_fn'])
 df.to_csv("Model_names_ALL.csv")
+################################################################
 #%% TEST different CV Thresholds for split_type = NONE
-
+################################################################
 Counter(df2['y'])
 Counter(df2['y_bts'])

+# READ Data
 spl_type = 'none'
 data_type = "complete"

@ -160,13 +162,13 @@ df2 = split_tts(df
          , include_gene_name = True
          , random_state = 42 # default
      )
-
-fooD = MultModelsCl_noBTS(input_df = df2['X']
+#%% Trying different CV thresholds for resampling 'none' ONLY
+fooD = MultModelsCl_CVs(input_df = df2['X']
                , target = df2['y']
                , skf_cv_threshold = 10 # IMP to change

                , tts_split_type  = spl_type
-                , resampling_type = 'XXXX' # default
+                , resampling_type = 'NONE' # default
                
                , add_cm = True  # adds confusion matrix based on cross_val_predict
                , add_yn = True  # adds target var class numbers
@ -185,7 +187,7 @@ for k, v in fooD.items():
          )

 # formatted df
-foo_df3 = MultModelsCl_noBTS(input_df = df2['X']
+foo_df3 = MultModelsCl_CVs(input_df = df2['X']
                , target = df2['y']
                , skf_cv_threshold = 5 # IMP to change

@ -203,6 +205,7 @@ foo_df3 = MultModelsCl_noBTS(input_df = df2['X']

                )

+
 dfs_combine_wf = [foo_df, foo_df2, foo_df3]

 common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))
@ -246,3 +249,46 @@ if len(common_cols_wf) == dfs_ncols_wf :
              , '\nGot:', len(combined_baseline_wf.columns))
        sys.exit('\nFIRST IF FAILS')

+#%% TRY with dict containing different Resampling types
+paramD = {
+        'baseline_paramD': { 'input_df'        : df2['X']
+                            , 'target'         : df2['y']
+                            , 'var_type'       : 'mixed'
+                            , 'resampling_type': 'none'}
+        
+        , 'smnc_paramD'  : { 'input_df'        : df2['X_smnc']
+                           , 'target'          : df2['y_smnc']
+                           , 'var_type'        : 'mixed'
+                           , 'resampling_type' : 'smnc'}
+        }
+
+mmDD = {}
+for k, v in paramD.items():
+    print(k)
+    all_scoresDF = pd.DataFrame()
+    for skf_cv_threshold in [3,5]:
+        print('\nRunning CV threhhold:', skf_cv_threshold)
+        current_scoreDF = MultModelsCl_CVs(**paramD[k]
+                            , skf_cv_threshold = skf_cv_threshold # IMP to change                        
+                            , tts_split_type   = spl_type
+                            #, resampling_type = 'XXXX' # default
+                            
+                            , add_cm = True  # adds confusion matrix based on cross_val_predict
+                            , add_yn = True  # adds target var class numbers
+                            
+                            #, var_type = ['mixed']
+                            , scale_numeric = ['min_max']
+                            , random_state = 42
+                            , n_jobs = os.cpu_count() 
+                            , return_formatted_output = True                             
+                            )
+        
+        all_scoresDF = pd.concat([all_scoresDF, current_scoreDF])
+    mmDD[k] = all_scoresDF
+        
+for k, v in mmDD.items():
+    print(k, v)
+    out_wf= pd.concat(mmDD, ignore_index = True)        
+    out_wf2= pd.concat(mmDD)        
+
+