"""Scratch script: run the multi-classifier pipeline on one gene/drug dataset.

Steps, in order:
  1. load the ML data for a gene/drug pair with ``getmldata``;
  2. split it with ``split_tts``;
  3. run ``MultModelsCl`` with a blind test set and print, per model, the
     'test_mcc' and 'bts_mcc' entries and their difference;
  4. compare min-max and standard scaling on two embB columns.

The ``#%%`` markers are kept so the file can still be run cell by cell.
"""
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

try:
    # load_boston was removed in scikit-learn 1.2. Only the commented-out
    # regression example below needs it, so a missing symbol must not stop
    # the rest of the script from running.
    from sklearn.datasets import load_boston
except ImportError:
    load_boston = None
###############################################################################
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
sys.path  # no-op in a script; echoes the search path in an interactive console

# Project-local helpers; they only resolve after the sys.path.append above.
from GetMLData import *
from SplitTTS import *
#from MultClfs_fi import *
from MultClfs import *

#%%
# X,y = load_boston(return_X_y=True)
# features = load_boston()['feature_names']
# X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)
# rf = RandomForestRegressor(random_state=0)
# rf.fit(X_train,y_train)
# f_i = list(zip(features,rf.feature_importances_))
# f_i.sort(key = lambda x : x[1])
# plt.barh([x[0] for x in f_i],[x[1] for x in f_i])
# plt.show()
#%%
# NOTE(review): `rs` is not defined in this file; it is expected to come from
# one of the star imports above (presumably a dict of random-state kwargs) —
# TODO confirm.
skf_cv = StratifiedKFold(n_splits=10, shuffle=True, **rs)
#sel_cv = logo
# sel_cv = RepeatedStratifiedKFold(n_splits = 5
#                                  , n_repeats = 3
#                                  , **rs)

# param dict for getmldata()
gene_model_paramD = {
    'data_combined_model': False,
    'use_or': False,
    'omit_all_genomic_features': False,
    'write_maskfile': False,
    'write_outfile': False,
}

# Pick ONE gene/drug pair. Previously all five loads ran back to back and
# every result but the last was overwritten, so only the last one is live.
#df = getmldata(gene, drug, **gene_model_paramD)
#df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
#df = getmldata('embB', 'ethambutol'  , **gene_model_paramD)
#df = getmldata('katG', 'isoniazid'   , **gene_model_paramD)
#df = getmldata('rpoB', 'rifampicin'  , **gene_model_paramD)
df = getmldata('gid', 'streptomycin', **gene_model_paramD)
#df = getmldata('alr' , 'cycloserine' , **combined_model_paramD)

# NOTE(review): this is False whenever at least one column is not named
# 'gene_name', so it does not prove the column is absent — confirm intent.
all(df.columns.isin(['gene_name']))  # should be False

# Split type passed to split_tts and MultModelsCl.
# Alternatives previously assigned (and overwritten) here: '70_30', '80_20'.
spl_type = 'sl'

df2 = split_tts(
    df,
    data_type='actual',
    split_type=spl_type,
    oversampling=False,
    dst_colname='dst',
    target_colname='dst_mode',
    include_gene_name=True,
    random_state=42,  # default
)

all(df2['X'].columns.isin(['gene_name']))  # should be False

fooD = MultModelsCl(
    input_df=df2['X'],
    target=df2['y'],
    sel_cv=skf_cv,
    run_blind_test=True,
    blind_test_df=df2['X_bts'],
    blind_test_target=df2['y_bts'],
    tts_split_type=spl_type,
    resampling_type='none',  # default
    var_type=['mixed'],
    scale_numeric=['min_max'],
    return_formatted_output=False,
)

# Per model: the 'test_mcc' and 'bts_mcc' entries and the gap between them.
for k, v in fooD.items():
    print('\nModel:', k,
          '\nTRAIN MCC:', v['test_mcc'],
          '\nBTS MCC:', v['bts_mcc'],
          '\nDIFF:', v['bts_mcc'] - v['test_mcc'])

#%% CHECK SCALING
# The original unpacked `combined_model_paramD` here, a name this file never
# defines; the parameter dict defined above is used instead.
embb_df = getmldata('embB', 'ethambutol', **gene_model_paramD)
all(embb_df.columns.isin(['gene_name']))  # should be False


def _fit_scale_to_frame(frame, scaler_obj):
    """Fit ``scaler_obj`` on ``frame`` and return the scaled values as a DataFrame.

    The result reuses ``frame``'s index so that a later column-wise
    ``pd.concat`` with ``frame`` lines the rows up; a bare ``pd.DataFrame`` of
    the scaled array would get a fresh 0..n-1 index and could misalign.
    """
    return pd.DataFrame(
        scaler_obj.fit_transform(frame),
        index=frame.index,
        columns=['vdw_scaled', 'ele_scaled'],
    )


# Min-max scaling to the range [-1, 1], shown next to the raw columns.
scaler = MinMaxScaler(feature_range=(-1, 1))
bar = embb_df[['vdwclashes_rr', 'electro_rr']]
bar_df1 = _fit_scale_to_frame(bar, scaler)
bar2 = pd.concat([bar, bar_df1], axis=1)

# Standard scaling of the same two columns, shown next to the raw columns.
scaler2 = StandardScaler()
baz = embb_df[['vdwclashes_rr', 'electro_rr']]
baz_df1 = _fit_scale_to_frame(baz, scaler2)
baz2 = pd.concat([baz, baz_df1], axis=1)

# Both comparisons side by side (column names repeat across the two halves).
a = pd.concat([bar2, baz2], axis=1)