added test_func_combined.py

2022-07-29 00:14:39 +01:00 · 2022-07-29 00:14:39 +01:00 · 26f284d76e
commit 26f284d76e
parent 9cd6613da6
1 changed files with 130 additions and 0 deletions
--- a/scripts/ml/ml_functions/test_func_combined.py
+++ b/scripts/ml/ml_functions/test_func_combined.py
@ -0,0 +1,130 @@
 import pandas as pd
 import os, sys
 import numpy as np
 from sklearn.datasets import load_boston 
 from sklearn.ensemble import RandomForestRegressor 
 from sklearn.model_selection import train_test_split
 from sklearn.feature_selection import RFECV
 import matplotlib.pyplot as plt
 ###############################################################################
 # Resolve the current user's home directory; the project checkout is expected
 # beneath it at the hard-coded relative path appended below.
 homedir = os.path.expanduser("~")
 # Make the project's ml_functions directory importable for the star imports below.
 sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
 # Bare expression: has no effect when run as a script; it only echoes the
 # search path when evaluated in an interactive console.
 sys.path
 # Project-local helper modules. Star imports are order-sensitive: a later
 # module overrides any same-named object bound by an earlier one.
 # NOTE(review): names used further down but not imported explicitly in this
 # file (StratifiedKFold, Counter, getmldata, split_tts, MultModelsCl_CVs)
 # presumably arrive through these star imports -- confirm.
 from GetMLData import *
 from SplitTTS import *
 from MultClfs import *
 from MultClfs_CVs import *
 #%% Cross-validation setup
 # Seed kept as a kwargs dict so it can be unpacked into any splitter.
 rs = dict(random_state=42)
 # Shuffled StratifiedKFold with 10 splits, seeded through rs.
 skf_cv = StratifiedKFold(
     n_splits=10,
     shuffle=True,
     **rs,
 )
 # Alternative splitters (disabled):
 #   sel_cv = logo
 #   sel_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, **rs)
 #%% READ data
 # Keyword arguments unpacked into getmldata() below.
 gene_model_paramD = {
     'data_combined_model': True,
     'use_or': False,
     'omit_all_genomic_features': False,
     'write_maskfile': False,
     'write_outfile': False,
 }
 # Active gene/drug pair. To test another target, enable exactly one of the
 # disabled alternatives listed underneath instead.
 df = getmldata('embB', 'ethambutol', **gene_model_paramD)
 # df = getmldata(gene, drug, **gene_model_paramD)
 # df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
 # df = getmldata('katG', 'isoniazid', **gene_model_paramD)
 # df = getmldata('rpoB', 'rifampicin', **gene_model_paramD)
 # df = getmldata('gid', 'streptomycin', **gene_model_paramD)
 # df = getmldata('alr', 'cycloserine', **gene_model_paramD)
 ##########################
 #%% TEST different CV Thresholds for split_type = NONE
 ################################################################
 # READ Data
 # Split settings; spl_type is reused by the MultModelsCl_CVs calls further down.
 spl_type = 'none'
 data_type = 'complete'
 # NOTE(review): `combined_df` is not assigned anywhere in the visible part of
 # this file (the frame loaded above is named `df`). It presumably comes from
 # the interactive session or from one of the star imports -- confirm before
 # running this script non-interactively.
 df2 = split_tts(
     ml_input_data=combined_df,
     data_type=data_type,
     split_type=spl_type,
     oversampling=True,
     dst_colname='dst',
     target_colname='dst_mode',
     include_gene_name=True,
     random_state=42,  # default
 )
 # Class-count sanity checks. These must come after the split: previously they
 # were evaluated before df2 was assigned (NameError on a top-to-bottom run),
 # and as bare expressions their result was discarded when run as a script.
 print('y:', Counter(df2['y']))
 print('y_bts:', Counter(df2['y_bts']))
 #%% Trying different CV thresholds for resampling 'none' ONLY
 # Single run on df2['X'] / df2['y'] with unformatted output, so the result
 # can be iterated per model and indexed by score name below.
 fooD = MultModelsCl_CVs(
     input_df=df2['X'],
     target=df2['y'],
     skf_cv_threshold=10,  # IMP to change
     tts_split_type=spl_type,
     resampling_type='NONE',  # default
     add_cm=True,  # adds confusion matrix based on cross_val_predict
     add_yn=True,  # adds target var class numbers
     var_type=['mixed'],
     scale_numeric=['min_max'],
     random_state=42,
     n_jobs=os.cpu_count(),
     return_formatted_output=False,
 )
 # NOTE(review): the label reads "TRAIN MCC" while the key is 'test_mcc';
 # presumably this is the cross-validation score on the training data -- confirm.
 for model_name, model_scores in fooD.items():
     print('\nModel:', model_name, '\nTRAIN MCC:', model_scores['test_mcc'])
 #%% TRY with dict containing different Resampling types
 # One kwargs dict per resampling scheme; each inner dict is unpacked into
 # MultModelsCl_CVs() by the loop that follows.
 paramD = {
     'baseline_paramD': {
         'input_df': df2['X'],
         'target': df2['y'],
         'var_type': 'mixed',
         'resampling_type': 'none',
     },
     'smnc_paramD': {
         'input_df': df2['X_smnc'],
         'target': df2['y_smnc'],
         'var_type': 'mixed',
         'resampling_type': 'smnc',
     },
 }
 # Run every resampling configuration in paramD at each CV threshold and
 # store one stacked score table per configuration in mmDD.
 mmDD = {}
 for k, v in paramD.items():
     print(k)
     # Collect the per-threshold frames and concatenate once at the end.
     # Seeding the accumulator with an empty DataFrame and re-concatenating on
     # every iteration is deprecated in recent pandas (concat with empty
     # entries) and copies the accumulated rows each time.
     scoreDFs = []
     for skf_cv_threshold in [3, 5]:
         print('\nRunning CV threhhold:', skf_cv_threshold)
         current_scoreDF = MultModelsCl_CVs(
             **v,  # input_df, target, var_type, resampling_type
             skf_cv_threshold=skf_cv_threshold,  # IMP to change
             tts_split_type=spl_type,
             add_cm=True,  # adds confusion matrix based on cross_val_predict
             add_yn=True,  # adds target var class numbers
             scale_numeric=['min_max'],
             random_state=42,
             n_jobs=os.cpu_count(),
             return_formatted_output=True,
         )
         scoreDFs.append(current_scoreDF)
     # Name kept so the last configuration's table stays available afterwards.
     all_scoresDF = pd.concat(scoreDFs)
     mmDD[k] = all_scoresDF
 # Show the stacked score table of every resampling configuration.
 for k, v in mmDD.items():
     print(k, v)
 # Combine all configurations once, after the loop: the result does not depend
 # on the loop variables, so rebuilding it on every iteration was redundant.
 # out_wf  : flat table with a fresh integer index.
 # out_wf2 : same rows, with the mmDD keys as the outer index level.
 out_wf = pd.concat(mmDD, ignore_index=True)
 out_wf2 = pd.concat(mmDD)