diff --git a/MultClassPipe3_CALL.py b/MultClassPipe3_CALL.py
index a2d18c1..df4311f 100644
--- a/MultClassPipe3_CALL.py
+++ b/MultClassPipe3_CALL.py
@@ -21,6 +21,7 @@ mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
 mm_skf_scores_df_all
 mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0)
 mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results
+baseline_BT = mm_skf_scores_df_all.filter(like='bts_', axis=0)
 #%%
 mm_skf_scoresD2 = MultClassPipeSKFCV(input_df = X_sm
@@ -29,28 +30,70 @@ mm_skf_scoresD2 = MultClassPipeSKFCV(input_df = X_sm
                                      , skf_cv = skf_cv)
 sm_all = pd.DataFrame(mm_skf_scoresD2)
 sm_df_CT = sm_all.filter(like='test_', axis=0)
+sm_df_BT = sm_all.filter(like='bts_', axis=0)
 #%%
 mm_skf_scoresD3 = MultClassPipeSKFCV(input_df = X_ros
                                      , target = y_ros
                                      , var_type = 'mixed'
-                                     , skf_cv = skf_cv)
+                                     , skf_cv = rskf_cv
+                                     , blind_test_input_df = X_bts
+                                     , blind_test_target = y_bts)
 ros_all = pd.DataFrame(mm_skf_scoresD3)
 ros_CT = ros_all.filter(like='test_', axis=0)
+ros_BT = ros_all.filter(like='bts_', axis=0)
+#--------- combined
+mm_skf_scoresD3v2 = MultClassPipeSKFCV(input_df = X_rouC
+                                       , target = y_rouC
+                                       , var_type = 'mixed'
+                                       , skf_cv = rskf_cv
+                                       , blind_test_input_df = X_bts
+                                       , blind_test_target = y_bts)
+rouC_all = pd.DataFrame(mm_skf_scoresD3v2)
+rouC_CT = rouC_all.filter(like='test_', axis=0)
+rouC_BT = rouC_all.filter(like='bts_', axis=0)
+
 #%%
 mm_skf_scoresD4 = MultClassPipeSKFCV(input_df = X_rus
                                      , target = y_rus
-                                     , var_type = 'mixed'
-                                     , skf_cv = skf_cv)
+                                     , var_type = 'numerical'
+                                     , skf_cv = rskf_cv
+                                     , blind_test_input_df = X_bts
+                                     , blind_test_target = y_bts)
 rus_all = pd.DataFrame(mm_skf_scoresD4)
 rus_CT = rus_all.filter(like='test_', axis=0)
+rus_BT = rus_all.filter(like='bts_' , axis=0)
 #%%
 mm_skf_scoresD5 = MultClassPipeSKFCV(input_df = X_enn
                                      , target = y_enn
-                                     , var_type = 'mixed'
-                                     , skf_cv = skf_cv)
+                                     , var_type = 'numerical'
+                                     , skf_cv = rskf_cv
+                                     , blind_test_input_df = X_bts
+                                     , blind_test_target = y_bts)
 enn_all = pd.DataFrame(mm_skf_scoresD5)
 enn_CT = enn_all.filter(like='test_', axis=0)
+enn_BT = enn_all.filter(like='bts_', axis=0)
+#%%
+mm_skf_scoresD6 = MultClassPipeSKFCV(input_df = X_renn
+                                     , target = y_renn
+                                     , var_type = 'numerical'
+                                     , skf_cv = rskf_cv
+                                     , blind_test_input_df = X_bts
+                                     , blind_test_target = y_bts)
+renn_all = pd.DataFrame(mm_skf_scoresD6)
+renn_CT = renn_all.filter(like='test_', axis=0)
+renn_BT = renn_all.filter(like='bts_', axis=0)
+
+#%%: with categorical values + oversampling
+mm_skf_scoresD7 = MultClassPipeSKFCV(input_df = X_smnc
+                                     , target = y_smnc
+                                     , var_type = 'mixed'
+                                     , skf_cv = rskf_cv
+                                     , blind_test_input_df = X_bts
+                                     , blind_test_target = y_bts)
+smnc_all = pd.DataFrame(mm_skf_scoresD7)
+smnc_CT = smnc_all.filter(like='test_', axis=0)
+smnc_BT = smnc_all.filter(like='bts_', axis=0)
diff --git a/UQ_MultClassPipe4.py b/UQ_MultClassPipe4.py
index 634ebca..7ee53d5 100644
--- a/UQ_MultClassPipe4.py
+++ b/UQ_MultClassPipe4.py
@@ -76,8 +76,8 @@ from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
 rs = {'random_state': 42}
 njobs = {'n_jobs': 10}
 
-scoring_fn = ({ 'fscore' : make_scorer(f1_score)
-              , 'mcc' : make_scorer(matthews_corrcoef)
+scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef)
+              , 'fscore' : make_scorer(f1_score)
               , 'precision' : make_scorer(precision_score)
               , 'recall' : make_scorer(recall_score)
               , 'accuracy' : make_scorer(accuracy_score)
@@ -87,7 +87,10 @@ scoring_fn = ({ 'fscore' : make_scorer(f1_score)
 
 # Multiple Classification - Model Pipeline
-def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categorical','mixed']):
+def MultClassPipeSKFCV(input_df, target, skf_cv
+                       , blind_test_input_df
+                       , blind_test_target
+                       , var_type = ['numerical', 'categorical','mixed']):
     '''
     @ param input_df: input features
@@ -120,8 +123,8 @@ def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categ
         t = [('cat', OneHotEncoder(), categorical_ix)]
 
     if var_type == 'mixed':
-        t = [('cat', OneHotEncoder(), categorical_ix)
-             , ('num', MinMaxScaler(), numerical_ix)]
+        t = [('num', MinMaxScaler(), numerical_ix)
+             , ('cat', OneHotEncoder(), categorical_ix) ]
 
     col_transform = ColumnTransformer(transformers = t
                                       , remainder='passthrough')
@@ -137,7 +140,7 @@ def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categ
 
     rf = RandomForestClassifier(**rs, n_estimators = 1000 )
     rf2 = RandomForestClassifier( min_samples_leaf = 5
-                                , n_estimators = 100 #10
+                                , n_estimators = 1000
                                 , bootstrap = True
                                 , oob_score = True
                                 , **njobs
@@ -158,16 +161,16 @@ def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categ
               , ('K-Nearest Neighbors', knn)
              , ('SVM' , svm)
              , ('MLP' , mlp)
-             # , ('Decision Tree' , dt)
-             # , ('Extra Trees' , et)
-             # , ('Random Forest' , rf)
-             # , ('Naive Bayes' , nb)
-             # , ('Random Forest2' , rf2)
-             # , ('XGBoost' , xgb)
-             # , ('LDA' , lda)
-             # , ('MultinomialNB' , mnb)
-             # , ('PassiveAggresive' , pa)
-             # , ('StochasticGDescent' , sgd)
+             , ('Decision Tree' , dt)
+             , ('Extra Trees' , et)
+             , ('Random Forest' , rf)
+             , ('Naive Bayes' , nb)
+             , ('Random Forest2' , rf2)
+             , ('XGBoost' , xgb)
+             , ('LDA' , lda)
+             , ('MultinomialNB' , mnb)
+             , ('PassiveAggresive' , pa)
+             , ('StochasticGDescent' , sgd)
              ]
 
     mm_skf_scoresD = {}
@@ -196,48 +199,41 @@ def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categ
         #pp.pprint(mm_skf_scoresD)
         #return(mm_skf_scoresD)
-
-
-
-
 #%%
         #=========================
         # Blind test: BTS results
        #=========================
        # Build the final results with all scores for a feature selected model
-        #bts_predict = gscv_fs.predict(X_bts)
+        #bts_predict = gscv_fs.predict(blind_test_input_df)
         model_pipeline.fit(input_df, target)
-        bts_predict = model_pipeline.predict(X_bts)
+        bts_predict = model_pipeline.predict(blind_test_input_df)
 
-        print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, bts_predict),2))
-        print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2))
-        bts_mcc_score = round(matthews_corrcoef(y_bts, bts_predict),2)
+        bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
+        print('\nMCC on Blind test:' , bts_mcc_score)
+        print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
 
         # Diff b/w train and bts test scores
        # train_test_diff = train_bscore - bts_mcc_score
        # print('\nDiff b/w train and blind test score (MCC):', train_test_diff)
 
-        # create a dict with all scores
-        lr_btsD = { 'model_name': model_name
-                   , 'bts_mcc':None
-                   , 'bts_fscore':None
-                   , 'bts_precision':None
-                   , 'bts_recall':None
-                   , 'bts_accuracy':None
-                   , 'bts_roc_auc':None
-                   , 'bts_jaccard':None}
+        # # create a dict with all scores
+        # lr_btsD = { 'model_name': model_name
+        #            , 'bts_mcc':None
+        #            , 'bts_fscore':None
+        #            , 'bts_precision':None
+        #            , 'bts_recall':None
+        #            , 'bts_accuracy':None
+        #            , 'bts_roc_auc':None
+        #            , 'bts_jaccard':None}
 
-        lr_btsD
-        lr_btsD['bts_mcc'] = bts_mcc_score
-        lr_btsD['bts_fscore'] = round(f1_score(y_bts, bts_predict),2)
-        lr_btsD['bts_precision'] = round(precision_score(y_bts, bts_predict),2)
-        lr_btsD['bts_recall'] = round(recall_score(y_bts, bts_predict),2)
-        lr_btsD['bts_accuracy'] = round(accuracy_score(y_bts, bts_predict),2)
-        lr_btsD['bts_roc_auc'] = round(roc_auc_score(y_bts, bts_predict),2)
-        lr_btsD['bts_jaccard'] = round(jaccard_score(y_bts, bts_predict),2)
-        lr_btsD
-
-    return(lr_btsD)
+        mm_skf_scoresD[model_name]['bts_mcc'] = bts_mcc_score
+        mm_skf_scoresD[model_name]['bts_fscore'] = round(f1_score(blind_test_target, bts_predict),2)
+        mm_skf_scoresD[model_name]['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2)
+        mm_skf_scoresD[model_name]['bts_recall'] = round(recall_score(blind_test_target, bts_predict),2)
+        mm_skf_scoresD[model_name]['bts_accuracy'] = round(accuracy_score(blind_test_target, bts_predict),2)
+        mm_skf_scoresD[model_name]['bts_roc_auc'] = round(roc_auc_score(blind_test_target, bts_predict),2)
+        mm_skf_scoresD[model_name]['bts_jaccard'] = round(jaccard_score(blind_test_target, bts_predict),2)
+    return(mm_skf_scoresD)
diff --git a/UQ_pnca_ML.py b/UQ_pnca_ML.py
index 7d5e4fe..9ba28e3 100644
--- a/UQ_pnca_ML.py
+++ b/UQ_pnca_ML.py
@@ -81,7 +81,11 @@ from sklearn.model_selection import cross_validate, cross_val_score
 from sklearn.model_selection import RepeatedStratifiedKFold
 from sklearn.ensemble import AdaBoostClassifier
 from imblearn.combine import SMOTEENN
+from imblearn.combine import SMOTETomek
+
+from imblearn.over_sampling import SMOTENC
 from imblearn.under_sampling import EditedNearestNeighbours
+from imblearn.under_sampling import RepeatedEditedNearestNeighbours
 
 from sklearn.model_selection import GridSearchCV
@@ -117,7 +121,8 @@ os.chdir(homedir + "/git/ML_AI_training/")
 #from MultClassPipe import MultClassPipeline
 from MultClassPipe2 import MultClassPipeline2
 from loopity_loop import MultClassPipeSKFLoop
-from MultClassPipe3 import MultClassPipeSKFCV
+#from MultClassPipe3 import MultClassPipeSKFCV
+from UQ_MultClassPipe4 import MultClassPipeSKFCV
 
 gene = 'pncA'
 drug = 'pyrazinamide'
@@ -285,9 +290,9 @@ all_df_wtgt.shape
 #------
 # X
 #------
-#X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL
+X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL
 X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL
-X = all_df_wtgt[numerical_FN] # training numerical only
+#X = all_df_wtgt[numerical_FN] # training numerical only
 #X_bts = blind_test_df[numerical_FN] # blind test data numerical
 
 #------
diff --git a/uq_ml_models_FS/scriptfsycm.py b/uq_ml_models_FS/scriptfsycm.py
index 4125c12..331f191 100644
--- a/uq_ml_models_FS/scriptfsycm.py
+++ b/uq_ml_models_FS/scriptfsycm.py
@@ -27,22 +27,56 @@ from sklearn.model_selection import train_test_split, cross_validate, cross_val_
 # Metric
 from sklearn.metrics import mean_squared_error, make_scorer, roc_auc_score, f1_score, matthews_corrcoef, accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report
+# needed by the ColumnTransformer pipeline below (if not already imported above)
+from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
+from sklearn.compose import ColumnTransformer
 
-def run_all_ML(input_pd, target_label):
+#def run_all_ML(input_pd, target_label, bts_input, bts_target, var_type):
+def run_all_ML(input_pd, target_label, preprocess = True, var_type = 'numerical'):
+
     #y = input_pd[target_label]
     #X = input_pd.drop(target_label,axis=1)
     y = target_label
     X = input_pd
 
+    # determine categorical and numerical features
+    numerical_ix = input_pd.select_dtypes(include=['int64', 'float64']).columns
+    numerical_ix
+    categorical_ix = input_pd.select_dtypes(include=['object', 'bool']).columns
+    categorical_ix
+
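+    # note: this dtype split assumes numeric features are int64/float64 and
+    # categorical ones are object/bool; any other columns simply fall through
+    # via the ColumnTransformer's remainder='passthrough' below
+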
+    # Determine preprocessing steps ~ var_type
+    if var_type == 'numerical':
+        t = [('num', MinMaxScaler(), numerical_ix)]
+
+    if var_type == 'categorical':
+        t = [('cat', OneHotEncoder(), categorical_ix)]
+    if var_type == 'mixed':
+        t = [('num', MinMaxScaler(), numerical_ix)
+             , ('cat', OneHotEncoder(), categorical_ix)]
+
+    col_transform = ColumnTransformer(transformers = t
+                                      , remainder='passthrough')
 
     result_pd = pd.DataFrame()
     for name, algorithm in all_estimators(type_filter="classifier"):
         try:
             estmator = algorithm()
             temp_pd = pd.DataFrame()
             temp_cm = pd.DataFrame()
-
+
+            # orig
             pipe = Pipeline([
-                ("model", algorithm())
+                ("model" , algorithm())
             ])
+
+            # turn on and off preprocessing
+            if preprocess == True:
+                pipe = Pipeline([
+                    ('prep'  , col_transform),
+                    ("model" , algorithm())
+                ])
+            else:
+                pipe = Pipeline([
+                    ("model" , algorithm())
+                ])
+
+
             y_pred = cross_val_predict(pipe, X, y, cv = 10, n_jobs=10)
             _mcc = round(matthews_corrcoef(y_pred, y), 3)
             _bacc = round(balanced_accuracy_score(y_pred, y), 3)
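
For reference, a minimal usage sketch of the two reworked entry points. It assumes X, y, X_bts, y_bts and rskf_cv are already built as in UQ_pnca_ML.py, and that run_all_ML (patched in uq_ml_models_FS/scriptfsycm.py) is importable; the variable names on the left are illustrative only.

from UQ_MultClassPipe4 import MultClassPipeSKFCV
import pandas as pd

# CV scores and blind-test ('bts_') scores now come back in one dict per model
scoresD = MultClassPipeSKFCV(input_df = X
                             , target = y
                             , var_type = 'mixed'
                             , skf_cv = rskf_cv
                             , blind_test_input_df = X_bts
                             , blind_test_target = y_bts)
scores_df = pd.DataFrame(scoresD)
scores_CT = scores_df.filter(like='test_', axis=0)  # CV test-fold scores
scores_BT = scores_df.filter(like='bts_', axis=0)   # blind-test scores

# sweep all sklearn classifiers, with the new preprocessing toggle switched on
run_all_ML(input_pd = X, target_label = y, preprocess = True, var_type = 'mixed')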