"""Scratch/driver script for the LSHTM ML pipeline.

Sanity-checks the project helpers (getmldata, split_tts, MultModelsCl,
MultModelsCl_noBTS) on the embB/ethambutol dataset, compares min-max vs
standard scaling on two structural features, exercises the extra split
types, and tests different CV thresholds for split_type = 'none'.

NOTE(review): this file was recovered from a whitespace-mangled paste and
re-formatted. It is organised as Spyder/Jupyter-style "#%%" cells and is
intended to be run cell-by-cell in an interactive session, not top-to-bottom.
"""
import os
import sys

import pandas as pd
import numpy as np
# NOTE(review): load_boston was removed in scikit-learn >= 1.2 and is unused
# below -- kept only because it was in the original import block; delete once
# confirmed nothing else in this file (outside this view) needs it.
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt

###############################################################################
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
sys.path  # interactive inspection of the import path

# import project helpers (these provide getmldata, split_tts,
# MultModelsCl, MultModelsCl_noBTS, ...)
from GetMLData import *
from SplitTTS import *
from MultClfs import *
from MultClfs_noBTS import *

# Explicit imports for names this script uses directly; previously these were
# only available implicitly via the star-imports above.
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler

#%%
rs = {'random_state': 42}
skf_cv = StratifiedKFold(n_splits=10, shuffle=True, **rs)
#sel_cv = logo
# sel_cv = RepeatedStratifiedKFold(n_splits = 5
#                                  , n_repeats = 3
#                                  , **rs)

# param dict for getmldata()
gene_model_paramD = {'data_combined_model'        : False
                     , 'use_or'                   : False
                     , 'omit_all_genomic_features': False
                     , 'write_maskfile'           : False
                     , 'write_outfile'            : False}

#df = getmldata(gene, drug, **gene_model_paramD)
#df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
df = getmldata('embB', 'ethambutol', **gene_model_paramD)
#df = getmldata('katG', 'isoniazid' , **gene_model_paramD)
#df = getmldata('rpoB', 'rifampicin' , **gene_model_paramD)
#df = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
#df = getmldata('alr' , 'cycloserine' , **gene_model_paramD)

all(df.columns.isin(['gene_name']))  # should be False

spl_type = '70_30'
#spl_type = '80_20'
#spl_type = 'sl'

#data_type = "actual"
data_type = "complete"

df2 = split_tts(df
                , data_type=data_type
                , split_type=spl_type
                , oversampling=True
                , dst_colname='dst'
                , target_colname='dst_mode'
                , include_gene_name=True
                , random_state=42  # default
                )
all(df2['X'].columns.isin(['gene_name']))  # should be False

# quick target-distribution sanity checks (interactive, results not stored)
df['dst'].value_counts()
df['dst'].isna().sum()
df['dst_mode'].value_counts()
len(df)
Counter(df2['y'])
Counter(df2['y_bts'])

#%% Run Multiple models
fooD = MultModelsCl(input_df=df2['X']
                    , target=df2['y']
                    , sel_cv=skf_cv
                    , run_blind_test=True
                    , blind_test_df=df2['X_bts']
                    , blind_test_target=df2['y_bts']
                    , tts_split_type=spl_type
                    , resampling_type='XXXX'  # default
                    , var_type=['mixed']
                    , scale_numeric=['min_max']
                    , return_formatted_output=False
                    )

# CV-vs-blind-test gap per model: MCC ...
for k, v in fooD.items():
    print('\nModel:', k
          , '\nTRAIN MCC:', fooD[k]['test_mcc']
          , '\nBTS MCC:', fooD[k]['bts_mcc']
          , '\nDIFF:', fooD[k]['bts_mcc'] - fooD[k]['test_mcc'])

# ... and accuracy
for k, v in fooD.items():
    print('\nModel:', k
          , '\nTRAIN ACCURACY:', fooD[k]['test_accuracy']
          , '\nBTS ACCURACY:', fooD[k]['bts_accuracy']
          , '\nDIFF:', fooD[k]['bts_accuracy'] - fooD[k]['test_accuracy'])

#%% CHECK SCALING
# BUG FIX(review): the original referenced undefined 'combined_model_paramD';
# gene_model_paramD is the only param dict defined in this script.
embb_df = getmldata('embB', 'ethambutol', **gene_model_paramD)
all(embb_df.columns.isin(['gene_name']))  # should be False

# Min-max scaling to [-1, 1] of two structural features, raw alongside scaled
scaler = MinMaxScaler(feature_range=(-1, 1))
bar = embb_df[['vdwclashes_rr', 'electro_rr']]
bar_df1 = scaler.fit_transform(bar)
bar_df1 = pd.DataFrame(bar_df1)
bar_df1.rename(columns={0: 'vdw_scaled', 1: 'ele_scaled'}, inplace=True)
bar2 = pd.concat([bar, bar_df1], axis=1)

# Same two features with standard (z-score) scaling for comparison
scaler2 = StandardScaler()
baz = embb_df[['vdwclashes_rr', 'electro_rr']]
baz_df1 = scaler2.fit_transform(baz)
baz_df1 = pd.DataFrame(baz_df1)
baz_df1.rename(columns={0: 'vdw_scaled', 1: 'ele_scaled'}, inplace=True)
baz2 = pd.concat([baz, baz_df1], axis=1)

a = pd.concat([bar2, baz2], axis=1)

#%% test added split_types i.e none_with_bts and none_only
spl_type = 'none_only'
spl_type = 'none_with_bts'
spl_type = 'rt'

#data_type = "actual"
data_type = "complete"

df2 = split_tts(df
                , data_type=data_type  # only works with complete despite what you set to
                , split_type=spl_type
                , oversampling=True
                , dst_colname='dst'
                , target_colname='dst_mode'
                , include_gene_name=True
                , random_state=42  # default
                )
all(df2['X'].columns.isin(['gene_name']))  # should be False

# Dump the names of all sklearn classifiers to CSV.
# BUG FIX(review): this section previously rebound 'df', clobbering the gene
# dataset that later cells pass to split_tts; use a dedicated name instead.
import pandas as pd
from sklearn.utils import all_estimators

all_clfs = all_estimators(type_filter="classifier")
clf_names_df = pd.DataFrame(all_clfs, columns=['classifier_name', 'classifier_fn'])
clf_names_df.to_csv("Model_names_ALL.csv")

#%% TEST different CV Thresholds for split_type = NONE
Counter(df2['y'])
Counter(df2['y_bts'])

spl_type = 'none'
data_type = "complete"

df2 = split_tts(df
                , data_type=data_type
                , split_type=spl_type
                , oversampling=True
                , dst_colname='dst'
                , target_colname='dst_mode'
                , include_gene_name=True
                , random_state=42  # default
                )

fooD = MultModelsCl_noBTS(input_df=df2['X']
                          , target=df2['y']
                          , skf_cv_threshold=10  # IMP to change
                          , tts_split_type=spl_type
                          , resampling_type='XXXX'  # default
                          , add_cm=True  # adds confusion matrix based on cross_val_predict
                          , add_yn=True  # adds target var class numbers
                          , var_type=['mixed']
                          , scale_numeric=['min_max']
                          , random_state=42
                          , n_jobs=os.cpu_count()
                          , return_formatted_output=False
                          )

for k, v in fooD.items():
    print('\nModel:', k
          , '\nTRAIN MCC:', fooD[k]['test_mcc'])

# formatted df
foo_df3 = MultModelsCl_noBTS(input_df=df2['X']
                             , target=df2['y']
                             , skf_cv_threshold=5  # IMP to change
                             , tts_split_type=spl_type
                             , resampling_type='XXXX'  # default
                             , add_cm=True  # adds confusion matrix based on cross_val_predict
                             , add_yn=True  # adds target var class numbers
                             , var_type=['mixed']
                             , scale_numeric=['min_max']
                             , random_state=42
                             , n_jobs=os.cpu_count()
                             , return_formatted_output=True
                             )

# NOTE(review): foo_df and foo_df2 are not defined anywhere in this file --
# presumably built interactively with other skf_cv_threshold values.
# They must exist in the session before this cell runs; verify.
dfs_combine_wf = [foo_df, foo_df2, foo_df3]

# Columns shared by every result df. Loop/comprehension variables renamed from
# 'df' so the gene dataset is not clobbered, as it was in the original.
common_cols_wf = list(set.intersection(*(set(d.columns) for d in dfs_combine_wf)))

# NOTE(review): scoresDF_CV / scoresDF_BT are also not defined in this file;
# this print only works in an interactive session where they already exist.
print('\nCombinig', len(dfs_combine_wf), 'using pd.concat by row ~ rowbind'
      , '\nChecking Dims of df to combine:'
      , '\nDim of CV:', scoresDF_CV.shape
      , '\nDim of BT:', scoresDF_BT.shape)
#print(scoresDF_CV)
#print(scoresDF_BT)

# max row count across the dfs being combined
dfs_nrows_wf = []
for d in dfs_combine_wf:
    dfs_nrows_wf = dfs_nrows_wf + [len(d)]
dfs_nrows_wf = max(dfs_nrows_wf)

# max column count across the dfs being combined
dfs_ncols_wf = []
for d in dfs_combine_wf:
    dfs_ncols_wf = dfs_ncols_wf + [len(d.columns)]
dfs_ncols_wf = max(dfs_ncols_wf)
print(dfs_ncols_wf)

# rowbind => rows add up, columns stay the same
expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
expected_ncols_wf = dfs_ncols_wf

if len(common_cols_wf) == dfs_ncols_wf:
    combined_baseline_wf = pd.concat([d[common_cols_wf] for d in dfs_combine_wf]
                                     , ignore_index=False)
    print('\nConcatenating dfs with different resampling methods [WF]:'
          , '\nSplit type:', spl_type
          , '\nNo. of dfs combining:', len(dfs_combine_wf))
    if (len(combined_baseline_wf) == expected_nrows_wf
            and len(combined_baseline_wf.columns) == expected_ncols_wf):
        print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
              , '\nnrows in combined_df_wf:', len(combined_baseline_wf)
              , '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
    else:
        print('\nFAIL: concatenating failed'
              , '\nExpected nrows:', expected_nrows_wf
              , '\nGot:', len(combined_baseline_wf)
              , '\nExpected ncols:', expected_ncols_wf
              , '\nGot:', len(combined_baseline_wf.columns))
        sys.exit('\nFIRST IF FAILS')