"""Run MultClfs_fi on the pncA / pyrazinamide data and compare MCC scores.

Steps performed at module level:
1. fetch the data frame with getmldata(),
2. split it with split_tts() using the '70_30' split type,
3. run MultClfs_fi() with a blind test set,
4. print, per model, the 'test_mcc' and 'bts_mcc' entries and their difference.
"""
import pandas as pd
import os
import sys
import numpy as np

# load_boston was removed in scikit-learn 1.2. Only the commented-out example
# further down refers to it, so a missing symbol must not abort the script.
try:
    from sklearn.datasets import load_boston
except ImportError:
    load_boston = None

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt

###############################################################################
# Make the project's ML helper modules importable.
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
sys.path  # no effect when run as a script; shows the search path in an interactive cell

# Project-local helpers.
# NOTE(review): StratifiedKFold and rs are not defined in this file; presumably
# they are brought into scope by one of these star imports -- confirm.
from GetMLData import *
from SplitTTS import *
from MultClfs_fi import *

#%%
# Reference example (disabled): random-forest feature importances on the
# Boston housing data. Needs scikit-learn < 1.2 for load_boston.
# X,y = load_boston(return_X_y=True)
# features = load_boston()['feature_names']
# X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)
# rf = RandomForestRegressor(random_state=0)
# rf.fit(X_train,y_train)
# f_i = list(zip(features,rf.feature_importances_))
# f_i.sort(key = lambda x : x[1])
# plt.barh([x[0] for x in f_i],[x[1] for x in f_i])
# plt.show()

#%%
# Cross-validation splitter handed to MultClfs_fi.
sel_cv = StratifiedKFold(n_splits=10, shuffle=True, **rs)
# Alternatives that were tried:
#sel_cv = logo
# sel_cv = RepeatedStratifiedKFold(n_splits = 5
#                                  , n_repeats = 3
#                                  , **rs)

# param dict for getmldata()
gene_model_paramD = {
    'data_combined_model': False,
    'use_or': False,
    'omit_all_genomic_features': False,
    'write_maskfile': False,
    'write_outfile': False,
}

#df = getmldata(gene, drug, **gene_model_paramD)
df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)

df2 = split_tts(
    df,
    data_type='actual',
    split_type='70_30',
    oversampling=False,
    dst_colname='dst',
    target_colname='dst_mode',
    include_gene_name=True,
    random_state=42,  # default
)

# NOTE(review): the result is discarded when run as a script. all() is True
# only if EVERY column is named 'gene_name'; if the intent is to check that
# the column is absent, any() is probably what was meant -- confirm.
all(df2['X'].columns.isin(['gene_name']))  # should be False

fooD = MultClfs_fi(
    input_df=df2['X'],
    target=df2['y'],
    sel_cv=sel_cv,
    run_blind_test=True,
    blind_test_df=df2['X_bts'],
    blind_test_target=df2['y_bts'],
    tts_split_type='70_30',
    var_type='mixed',
    resampling_type='none',  # default
)

# Per-model summary: blind-test MCC versus the 'test_mcc' entry.
# NOTE(review): the label says TRAIN but the key read is 'test_mcc' --
# presumably the cross-validation score on the training split; confirm.
for model_name, scores in fooD.items():
    print(
        '\nModel:', model_name,
        '\nTRAIN MCC:', scores['test_mcc'],
        '\nBTS MCC:', scores['bts_mcc'],
        '\nDIFF:', scores['bts_mcc'] - scores['test_mcc'],
    )