# Scratch script for the ML pipeline helpers: fetch the per-gene ML data with
# getmldata(), split it with split_tts(), run MultModelsCl() with a blind
# test set, print the MCC scores per model, and compare two scalers on two
# columns. Written as '#%%' cells, so parts are meant to be run selectively.
import pandas as pd
import os, sys
import numpy as np
# NOTE(review): load_boston was removed from scikit-learn in version 1.2, so
# this import fails on newer installs. It is not used in this script.
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
###############################################################################
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
sys.path

# import
# NOTE(review): names used below that are not imported explicitly
# (StratifiedKFold, rs, Counter, MinMaxScaler, StandardScaler, getmldata,
# split_tts, MultModelsCl) presumably come from these star imports -- confirm.
from GetMLData import *
from SplitTTS import *
#from MultClfs import *
from MultClfs_SIMPLE import *

#%%
skf_cv = StratifiedKFold(n_splits = 10
                         , shuffle = True
                         , **rs)
#sel_cv = logo
# sel_cv = RepeatedStratifiedKFold(n_splits = 5
#                                  , n_repeats = 3
#                                  , **rs)

# param dict for getmldata()
gene_model_paramD = {'data_combined_model'        : False
                     , 'use_or'                   : False
                     , 'omit_all_genomic_features': False
                     , 'write_maskfile'           : False
                     , 'write_outfile'            : False }

# Each call below rebinds df, so when the cell is run top to bottom only the
# last gene/drug pair is carried forward; run a single line to pick another.
#df = getmldata(gene, drug, **gene_model_paramD)
df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
df = getmldata('embB', 'ethambutol'  , **gene_model_paramD)
df = getmldata('katG', 'isoniazid'   , **gene_model_paramD)
df = getmldata('rpoB', 'rifampicin'  , **gene_model_paramD)
df = getmldata('gid' , 'streptomycin', **gene_model_paramD)
#df = getmldata('alr' , 'cycloserine' , **combined_model_paramD)

# NOTE(review): all(...) is False as soon as any column is not 'gene_name',
# so this does not by itself show that 'gene_name' is absent.
all(df.columns.isin(['gene_name'])) # should be False

spl_type = '70_30'
#spl_type = '80_20'
#spl_type = 'sl'

#data_type = "actual"
data_type = "complete"

df2 = split_tts(df
                , data_type = data_type
                , split_type = spl_type
                , oversampling = True
                , dst_colname = 'dst'
                , target_colname = 'dst_mode'
                , include_gene_name = True
                , random_state = 42 # default
                )

all(df2['X'].columns.isin(['gene_name'])) # should be False

# Interactive inspection of the target columns and class balance.
df['dst'].value_counts()
df['dst'].isna().sum()
df['dst_mode'].value_counts()
len(df)

Counter(df2['y'])
Counter(df2['y_bts'])

fooD = MultModelsCl(input_df = df2['X_ros']
                    , target = df2['y_ros']
                    , sel_cv = skf_cv
                    , run_blind_test = True
                    , blind_test_df = df2['X_bts']
                    , blind_test_target = df2['y_bts']
                    , tts_split_type = spl_type
                    , resampling_type = 'XXXX' # default
                    , var_type = ['mixed']
                    , scale_numeric = ['min_max']
                    , return_formatted_output = False
                    )

# Per model: the 'test_mcc' score, the blind-test 'bts_mcc' score, and their
# difference (blind test minus 'test_mcc').
for model_name, scores in fooD.items():
    print('\nModel:', model_name
          , '\nTRAIN MCC:', scores['test_mcc']
          , '\nBTS MCC:'  , scores['bts_mcc']
          , '\nDIFF:', scores['bts_mcc'] - scores['test_mcc'] )

#%% CHECK SCALING
# combined_model_paramD is not defined anywhere in this script, so the dict
# defined above is used instead.
# NOTE(review): if a combined-model parameter dict was intended here, define
# it explicitly before this call.
embb_df = getmldata('embB', 'ethambutol', **gene_model_paramD)
all(embb_df.columns.isin(['gene_name'])) # should be False

scaler = MinMaxScaler(feature_range=(-1, 1))
bar = embb_df[['vdwclashes_rr', 'electro_rr']]
# Reuse bar's index so the axis=1 concat below pairs each raw row with its
# own scaled row even when embb_df does not have a default 0..n-1 index.
bar_df1 = pd.DataFrame(scaler.fit_transform(bar), index = bar.index)
bar_df1 = bar_df1.rename(columns = {0:'vdw_scaled', 1: 'ele_scaled'})
bar2 = pd.concat([bar, bar_df1], axis = 1)

scaler2 = StandardScaler()
baz = embb_df[['vdwclashes_rr', 'electro_rr']]
baz_df1 = pd.DataFrame(scaler2.fit_transform(baz), index = baz.index)
baz_df1 = baz_df1.rename(columns = {0:'vdw_scaled', 1: 'ele_scaled'})
baz2 = pd.concat([baz, baz_df1], axis = 1)

# Side-by-side view: raw + min-max scaled, then raw + standard scaled.
a = pd.concat([bar2, baz2], axis = 1)

#%% test added split_types i.e none_with_bts and none_only
# The second assignment wins when the cell is run as a whole; run one line
# at a time to test the other split type.
spl_type = 'none_with_bts'
spl_type = 'none_only'

#data_type = "actual"
data_type = "complete"

df2 = split_tts(df
                , data_type = data_type # only works with complete despite what you set to
                , split_type = spl_type
                , oversampling = True
                , dst_colname = 'dst'
                , target_colname = 'dst_mode'
                , include_gene_name = True
                , random_state = 42 # default
                )

all(df2['X'].columns.isin(['gene_name'])) # should be False