"""Exploratory script: compare CV thresholds and resampling types.

Workflow, as visible in this file:

1. Load the ML input data for one gene/drug pair with ``getmldata``.
2. Build the modelling inputs with ``split_tts`` using ``split_type='none'``.
3. Run ``MultModelsCl_CVs`` once with ``skf_cv_threshold=10`` and print the
   ``test_mcc`` entry reported for every model.
4. Re-run ``MultModelsCl_CVs`` for each parameter set in ``paramD`` and for
   each CV threshold in ``[3, 5]``, then collect the formatted score tables.

``getmldata``, ``split_tts`` and ``MultModelsCl_CVs`` come from the project
modules star-imported below; their exact contracts are not visible here.
"""
import os
import sys
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold, train_test_split

try:
    # load_boston was removed in scikit-learn 1.2. It is not used in this
    # script, so its absence must not prevent the script from starting.
    from sklearn.datasets import load_boston
except ImportError:
    load_boston = None

###############################################################################
homedir = os.path.expanduser("~")
# The project helper modules are not installed as a package; make them
# importable before the star imports below.
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')

# import
from GetMLData import *
from SplitTTS import *
from MultClfs import *
from MultClfs_CVs import *

#%%
rs = {'random_state': 42}

skf_cv = StratifiedKFold(n_splits=10, shuffle=True, **rs)

# Alternative CV splitters that were tried:
#   sel_cv = logo
#   sel_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, **rs)

#%% READ data
# param dict for getmldata()
gene_model_paramD = {
    'data_combined_model': True,
    'use_or': False,
    'omit_all_genomic_features': False,
    'write_maskfile': False,
    'write_outfile': False,
}

# Other gene/drug pairs this script has been run with:
#   getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
#   getmldata('katG', 'isoniazid'   , **gene_model_paramD)
#   getmldata('rpoB', 'rifampicin'  , **gene_model_paramD)
#   getmldata('gid' , 'streptomycin', **gene_model_paramD)
#   getmldata('alr' , 'cycloserine' , **gene_model_paramD)
df = getmldata('embB', 'ethambutol', **gene_model_paramD)

##########################
#%% TEST different CV Thresholds for split_type = NONE
################################################################
spl_type = 'none'
data_type = 'complete'

# NOTE(review): the original passed an undefined name `combined_df` here;
# `df` (the getmldata() result above) is assumed to be the intended input.
df2 = split_tts(
    ml_input_data=df,
    data_type=data_type,
    split_type=spl_type,
    oversampling=True,
    dst_colname='dst',
    target_colname='dst_mode',
    include_gene_name=True,
    random_state=42,  # default
)

# Class counts of the targets. These must run after df2 exists; the
# original evaluated them before df2 was assigned.
print(Counter(df2['y']))
print(Counter(df2['y_bts']))

#%% Trying different CV thresholds for resampling 'none' ONLY
fooD = MultModelsCl_CVs(
    input_df=df2['X'],
    target=df2['y'],
    skf_cv_threshold=10,  # IMP to change
    tts_split_type=spl_type,
    resampling_type='NONE',  # default
    add_cm=True,  # adds confusion matrix based on cross_val_predict
    add_yn=True,  # adds target var class numbers
    # NOTE(review): var_type is a list here but a plain string in paramD
    # below; confirm which form MultModelsCl_CVs expects.
    var_type=['mixed'],
    scale_numeric=['min_max'],
    random_state=42,
    n_jobs=os.cpu_count(),
    return_formatted_output=False,
)

for k, v in fooD.items():
    print('\nModel:', k, '\nTRAIN MCC:', v['test_mcc'])

#%% TRY with dict containing different Resampling types
paramD = {
    'baseline_paramD': {
        'input_df': df2['X'],
        'target': df2['y'],
        'var_type': 'mixed',
        'resampling_type': 'none',
    },
    'smnc_paramD': {
        'input_df': df2['X_smnc'],
        'target': df2['y_smnc'],
        'var_type': 'mixed',
        'resampling_type': 'smnc',
    },
}

# Maps each paramD key to the score tables of all CV thresholds, stacked.
mmDD = {}
for k, v in paramD.items():
    print(k)
    # Collect the per-threshold tables and concatenate once, rather than
    # growing a DataFrame inside the loop starting from an empty frame.
    score_frames = []
    for skf_cv_threshold in [3, 5]:
        print('\nRunning CV threhhold:', skf_cv_threshold)
        current_scoreDF = MultModelsCl_CVs(
            **v,
            skf_cv_threshold=skf_cv_threshold,  # IMP to change
            tts_split_type=spl_type,
            add_cm=True,  # adds confusion matrix based on cross_val_predict
            add_yn=True,  # adds target var class numbers
            scale_numeric=['min_max'],
            random_state=42,
            n_jobs=os.cpu_count(),
            return_formatted_output=True,
        )
        score_frames.append(current_scoreDF)
    all_scoresDF = pd.concat(score_frames)
    mmDD[k] = all_scoresDF

for k, v in mmDD.items():
    print(k, v)

# Same rows in two layouts: a fresh 0..n-1 index, and an index whose outer
# level is the paramD key.
out_wf = pd.concat(mmDD, ignore_index=True)
out_wf2 = pd.concat(mmDD)