From 8d831f3613fc73b98d59d52e28fa5631df55ad1c Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 5 Jul 2022 22:47:13 +0100 Subject: [PATCH] added different scaling options --- scripts/ml/ml_functions/MultClfs.py | 74 +++++++++++++------ scripts/ml/ml_functions/ml_data_combined.py | 2 +- .../ml/ml_functions/test_func_singlegene.py | 54 +++++++++++--- 3 files changed, 99 insertions(+), 31 deletions(-) diff --git a/scripts/ml/ml_functions/MultClfs.py b/scripts/ml/ml_functions/MultClfs.py index 688caf3..290c06a 100755 --- a/scripts/ml/ml_functions/MultClfs.py +++ b/scripts/ml/ml_functions/MultClfs.py @@ -142,7 +142,9 @@ scoreBT_mapD = {'bts_mcc' : 'MCC' # Run Multiple Classifiers ############################ # Multiple Classification - Model Pipeline -def MultModelsCl(input_df, target, skf_cv +def MultModelsCl(input_df, target + #, skf_cv + , sel_cv , blind_test_df , blind_test_target , tts_split_type @@ -150,7 +152,8 @@ def MultModelsCl(input_df, target, skf_cv , resampling_type = 'none' # default , add_cm = True # adds confusion matrix based on cross_val_predict , add_yn = True # adds target var class numbers - , var_type = ['numerical', 'categorical','mixed'] + , var_type = ['numerical', 'categorical','mixed'] + , scale_numeric = ['min_max', 'std', 'min_max_neg', 'none'] , run_blind_test = True , return_formatted_output = True): @@ -182,24 +185,52 @@ def MultModelsCl(input_df, target, skf_cv #====================================================== # Determine preprocessing steps ~ var_type #====================================================== - if var_type == 'numerical': - t = [('num', MinMaxScaler(), numerical_ix)] + + # if var_type == 'numerical': + # t = [('num', MinMaxScaler(), numerical_ix)] + # if var_type == 'categorical': + # t = [('cat', OneHotEncoder(), categorical_ix)] + + # # if var_type == 'mixed': + # # t = [('num', MinMaxScaler(), numerical_ix) + # # , ('cat', OneHotEncoder(), categorical_ix) ] + + # if var_type == 'mixed': + # t = [('cat', OneHotEncoder(), categorical_ix) ] + if type(var_type) == list: + var_type = str(var_type[0]) + else: + var_type = var_type + + if var_type in ['numerical','mixed']: + if scale_numeric == ['none']: + t = [('cat', OneHotEncoder(), categorical_ix)] + if scale_numeric != ['none']: + if scale_numeric == ['min_max']: + scaler = MinMaxScaler() + if scale_numeric == ['min_max_neg']: + scaler = MinMaxScaler(feature_range=(-1, 1)) + if scale_numeric == ['std']: + scaler = StandardScaler() + + t = [('num', scaler, numerical_ix) + , ('cat', OneHotEncoder(), categorical_ix)] + + if var_type == 'categorical': t = [('cat', OneHotEncoder(), categorical_ix)] - - if var_type == 'mixed': - t = [('num', MinMaxScaler(), numerical_ix) - , ('cat', OneHotEncoder(), categorical_ix) ] + col_transform = ColumnTransformer(transformers = t , remainder='passthrough') + #====================================================== # Specify multiple Classification Models #====================================================== models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) - , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) ) + , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) ) , ('Decision Tree' , DecisionTreeClassifier(**rs) ) , ('Extra Tree' , ExtraTreeClassifier(**rs) ) , ('Extra Trees' , ExtraTreesClassifier(**rs) ) @@ -211,18 +242,18 @@ def MultModelsCl(input_df, target, skf_cv , ('Logistic Regression' , LogisticRegression(**rs) ) , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) , ('MLP' , MLPClassifier(max_iter = 500, **rs) ) - , ('Multinomial' , MultinomialNB() ) + #, ('Multinomial' , MultinomialNB() ) , ('Naive Bayes' , BernoulliNB() ) , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) , ('QDA' , QuadraticDiscriminantAnalysis() ) , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) - , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 - , n_estimators = 1000 - , bootstrap = True - , oob_score = True - , **njobs - , **rs - , max_features = 'auto') ) + # , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 + # , n_estimators = 1000 + # , bootstrap = True + # , oob_score = True + # , **njobs + # , **rs + # , max_features = 'auto') ) , ('Ridge Classifier' , RidgeClassifier(**rs) ) , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) , ('SVC' , SVC(**rs) ) @@ -254,7 +285,7 @@ def MultModelsCl(input_df, target, skf_cv skf_cv_modD = cross_validate(model_pipeline , input_df , target - , cv = skf_cv + , cv = sel_cv , scoring = scoring_fn , return_train_score = True) #============================== @@ -283,7 +314,7 @@ def MultModelsCl(input_df, target, skf_cv cmD = {} # Calculate cm - y_pred = cross_val_predict(model_pipeline, input_df, target, cv = skf_cv, **njobs) + y_pred = cross_val_predict(model_pipeline, input_df, target, cv = sel_cv, **njobs) #_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally tn, fp, fn, tp = confusion_matrix(y_pred, target).ravel() @@ -334,8 +365,9 @@ def MultModelsCl(input_df, target, skf_cv bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2) print('\nMCC on Blind test:' , bts_mcc_score) - print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2)) - + #print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2)) + print('\nMCC on Training:' , mm_skf_scoresD[model_name]['test_mcc'] ) + mm_skf_scoresD[model_name]['bts_mcc'] = bts_mcc_score mm_skf_scoresD[model_name]['bts_fscore'] = round(f1_score(blind_test_target, bts_predict),2) mm_skf_scoresD[model_name]['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2) diff --git a/scripts/ml/ml_functions/ml_data_combined.py b/scripts/ml/ml_functions/ml_data_combined.py index 57e2295..7dca351 100644 --- a/scripts/ml/ml_functions/ml_data_combined.py +++ b/scripts/ml/ml_functions/ml_data_combined.py @@ -26,7 +26,7 @@ from GetMLData import * combined_model_paramD = {'data_combined_model' : True , 'use_or' : False , 'omit_all_genomic_features': False - , 'write_maskfile' : False + , 'write_maskfile' : False # true once for writing and checking , 'write_outfile' : False } pnca_df = getmldata('pncA', 'pyrazinamide' , **combined_model_paramD) diff --git a/scripts/ml/ml_functions/test_func_singlegene.py b/scripts/ml/ml_functions/test_func_singlegene.py index d483514..26a0095 100644 --- a/scripts/ml/ml_functions/test_func_singlegene.py +++ b/scripts/ml/ml_functions/test_func_singlegene.py @@ -14,7 +14,8 @@ sys.path # import from GetMLData import * from SplitTTS import * -from MultClfs_fi import * +#from MultClfs_fi import * +from MultClfs import * #%% # X,y = load_boston(return_X_y=True) @@ -33,7 +34,7 @@ from MultClfs_fi import * #%% -sel_cv = StratifiedKFold(n_splits = 10 +skf_cv = StratifiedKFold(n_splits = 10 , shuffle = True,**rs) #sel_cv = logo # sel_cv = RepeatedStratifiedKFold(n_splits = 5 @@ -48,10 +49,21 @@ gene_model_paramD = {'data_combined_model' : False #df = getmldata(gene, drug, **gene_model_paramD) df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD) +df = getmldata('embB', 'ethambutol' , **gene_model_paramD) +df = getmldata('katG', 'isoniazid' , **gene_model_paramD) +df = getmldata('rpoB', 'rifampicin' , **gene_model_paramD) +df = getmldata('gid' , 'streptomycin' , **gene_model_paramD) +#df = getmldata('alr' , 'cycloserine' , **combined_model_paramD) +all(df.columns.isin(['gene_name'])) # should be False + + +spl_type = '70_30' +spl_type = '80_20' +spl_type = 'sl' df2 = split_tts(df , data_type = 'actual' - , split_type = '70_30' + , split_type = spl_type , oversampling = False , dst_colname = 'dst' , target_colname = 'dst_mode' @@ -61,19 +73,43 @@ df2 = split_tts(df all(df2['X'].columns.isin(['gene_name'])) # should be False -fooD = MultClfs_fi (input_df = df2['X'] +fooD = MultModelsCl(input_df = df2['X'] , target = df2['y'] - , sel_cv = sel_cv + , sel_cv = skf_cv , run_blind_test = True , blind_test_df = df2['X_bts'] , blind_test_target = df2['y_bts'] - , tts_split_type = '70_30' - , var_type = 'mixed' + , tts_split_type = spl_type , resampling_type = 'none' # default -) + , var_type = ['mixed'] + , scale_numeric = ['min_max_neg'] + , return_formatted_output = False + + ) for k, v in fooD.items(): print('\nModel:', k , '\nTRAIN MCC:', fooD[k]['test_mcc'] , '\nBTS MCC:' , fooD[k]['bts_mcc'] - , '\nDIFF:',fooD[k]['bts_mcc'] - fooD[k]['test_mcc'] ) \ No newline at end of file + , '\nDIFF:',fooD[k]['bts_mcc'] - fooD[k]['test_mcc'] ) + +#%% CHECK SCALING +embb_df = getmldata('embB', 'ethambutol' , **combined_model_paramD) +all(embb_df.columns.isin(['gene_name'])) # should be False + +scaler = MinMaxScaler(feature_range=(-1, 1)) +bar = embb_df[['vdwclashes_rr', 'electro_rr']] +bar_df1 = scaler.fit_transform(bar) +bar_df1 = pd.DataFrame(bar_df1) +bar_df1.rename(columns = {0:'vdw_scaled', 1: 'ele_scaled'}, inplace = True) +bar2 = pd.concat([bar, bar_df1], axis = 1) + + +scaler2 = StandardScaler() +baz = embb_df[['vdwclashes_rr', 'electro_rr']] +baz_df1 = scaler2.fit_transform(baz) +baz_df1 = pd.DataFrame(baz_df1) +baz_df1.rename(columns = {0:'vdw_scaled', 1: 'ele_scaled'}, inplace = True) +baz2 = pd.concat([baz, baz_df1], axis = 1) + +a = pd.concat([bar2, baz2], axis = 1)