added test_func_combined.py

2022-07-29 00:14:39 +01:00 · 2022-07-29 00:14:39 +01:00 · 26f284d76e
commit 26f284d76e
parent 9cd6613da6
1 changed files with 130 additions and 0 deletions
--- a/scripts/ml/ml_functions/test_func_combined.py
+++ b/scripts/ml/ml_functions/test_func_combined.py
@ -0,0 +1,130 @@
+import pandas as pd
+import os, sys
+import numpy as np
+from sklearn.datasets import load_boston 
+from sklearn.ensemble import RandomForestRegressor 
+from sklearn.model_selection import train_test_split
+from sklearn.feature_selection import RFECV
+import matplotlib.pyplot as plt
+###############################################################################
+homedir = os.path.expanduser("~")
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
+sys.path
+
+# import
+from GetMLData import *
+from SplitTTS import *
+from MultClfs import *
+from MultClfs_CVs import *
+
+#%%
+rs = {'random_state': 42}
+skf_cv = StratifiedKFold(n_splits = 10
+                            , shuffle = True,**rs)
+#sel_cv = logo
+# sel_cv = RepeatedStratifiedKFold(n_splits = 5
+#                                   , n_repeats = 3
+#                                   , **rs)
+# param dict for getmldata()
+#%% READ data
+gene_model_paramD = {'data_combined_model'       : True
+                    , 'use_or'                   : False
+                    , 'omit_all_genomic_features': False
+                    , 'write_maskfile'           : False
+                    , 'write_outfile'            : False }
+
+#df = getmldata(gene, drug, **gene_model_paramD)
+#df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
+df = getmldata('embB', 'ethambutol'   , **gene_model_paramD)
+#df = getmldata('katG', 'isoniazid'    , **gene_model_paramD)
+#df = getmldata('rpoB', 'rifampicin'   , **gene_model_paramD)
+#df  = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
+#df  = getmldata('alr' , 'cycloserine'  , **gene_model_paramD)
+##########################
+#%% TEST different CV Thresholds for split_type = NONE
+################################################################
+Counter(df2['y'])
+Counter(df2['y_bts'])
+
+# READ Data
+spl_type = 'none'
+data_type = 'complete'
+
+df2 = split_tts(ml_input_data = combined_df 
+          , data_type = data_type
+          , split_type = spl_type
+          , oversampling = True
+          , dst_colname = 'dst'
+          , target_colname = 'dst_mode'
+          , include_gene_name = True
+          , random_state = 42 # default
+      )
+#%% Trying different CV thresholds for resampling 'none' ONLY
+fooD = MultModelsCl_CVs(input_df = df2['X']
+                , target = df2['y']
+                , skf_cv_threshold = 10 # IMP to change
+
+                , tts_split_type  = spl_type
+                , resampling_type = 'NONE' # default
+                
+                , add_cm = True  # adds confusion matrix based on cross_val_predict
+                , add_yn = True  # adds target var class numbers
+                
+                , var_type = ['mixed']
+                , scale_numeric = ['min_max']
+                , random_state = 42
+                , n_jobs = os.cpu_count() 
+                , return_formatted_output = False
+
+                )
+
+for k, v in fooD.items():
+    print('\nModel:', k
+          , '\nTRAIN MCC:', fooD[k]['test_mcc']
+          )
+
+
+
+#%% TRY with dict containing different Resampling types
+paramD = {
+        'baseline_paramD': { 'input_df'        : df2['X']
+                            , 'target'         : df2['y']
+                            , 'var_type'       : 'mixed'
+                            , 'resampling_type': 'none'}
+        
+        , 'smnc_paramD'  : { 'input_df'        : df2['X_smnc']
+                           , 'target'          : df2['y_smnc']
+                           , 'var_type'        : 'mixed'
+                           , 'resampling_type' : 'smnc'}
+        }
+
+mmDD = {}
+for k, v in paramD.items():
+    print(k)
+    all_scoresDF = pd.DataFrame()
+    for skf_cv_threshold in [3,5]:
+        print('\nRunning CV threhhold:', skf_cv_threshold)
+        current_scoreDF = MultModelsCl_CVs(**paramD[k]
+                            , skf_cv_threshold = skf_cv_threshold # IMP to change                        
+                            , tts_split_type   = spl_type
+                            #, resampling_type = 'XXXX' # default
+                            
+                            , add_cm = True  # adds confusion matrix based on cross_val_predict
+                            , add_yn = True  # adds target var class numbers
+                            
+                            #, var_type = ['mixed']
+                            , scale_numeric = ['min_max']
+                            , random_state = 42
+                            , n_jobs = os.cpu_count() 
+                            , return_formatted_output = True                             
+                            )
+        
+        all_scoresDF = pd.concat([all_scoresDF, current_scoreDF])
+    mmDD[k] = all_scoresDF
+        
+for k, v in mmDD.items():
+    print(k, v)
+    out_wf= pd.concat(mmDD, ignore_index = True)        
+    out_wf2= pd.concat(mmDD)        
+
+