diff --git a/scripts/ml/MultModelsCl.py b/scripts/ml/MultModelsCl.py index dfcd87c..6ed37cd 100755 --- a/scripts/ml/MultModelsCl.py +++ b/scripts/ml/MultModelsCl.py @@ -137,95 +137,76 @@ def MultModelsCl(input_df, target, skf_cv col_transform = ColumnTransformer(transformers = t , remainder='passthrough') - # Specify multiple Classification models - lr = LogisticRegression(**rs) - lrcv = LogisticRegressionCV(**rs) - gnb = GaussianNB() - nb = BernoulliNB() - knn = KNeighborsClassifier() - svc = SVC(**rs) - mlp = MLPClassifier(max_iter = 500, **rs) - dt = DecisionTreeClassifier(**rs) - ets = ExtraTreesClassifier(**rs) - et = ExtraTreeClassifier(**rs) - rf = RandomForestClassifier(**rs, n_estimators = 1000 ) - rf2 = RandomForestClassifier( - min_samples_leaf = 5 - , n_estimators = 1000 - , bootstrap = True - , oob_score = True - , **njobs - , **rs - , max_features = 'auto') - xgb = XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) - - lda = LinearDiscriminantAnalysis() - - mnb = MultinomialNB() - - pa = PassiveAggressiveClassifier(**rs, **njobs) - - sgd = SGDClassifier(**rs, **njobs) - - abc = AdaBoostClassifier(**rs) - bc = BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) - gpc = GaussianProcessClassifier(**rs) - gbc = GradientBoostingClassifier(**rs) - qda = QuadraticDiscriminantAnalysis() - rc = RidgeClassifier(**rs) - rccv = RidgeClassifierCV(cv = 10) - - models = [('Logistic Regression' , lr) - , ('Logistic RegressionCV' , lrcv) - , ('Gaussian NB' , gnb) - , ('Naive Bayes' , nb) - , ('K-Nearest Neighbors' , knn) - , ('SVC' , svc) - , ('MLP' , mlp) - , ('Decision Tree' , dt) - , ('Extra Trees' , ets) - , ('Extra Tree' , et) - , ('Random Forest' , rf) - , ('Random Forest2' , rf2) - , ('XGBoost' , xgb) - , ('LDA' , lda) - , ('Multinomial' , mnb) - , ('Passive Aggresive' , pa) - , ('Stochastic GDescent' , sgd) - , ('AdaBoost Classifier' , abc) - , ('Bagging Classifier' , bc) - , ('Gaussian Process' , gpc) - , ('Gradient Boosting' , gbc) - , ('QDA' , qda) - , ('Ridge Classifier' , rc) - , ('Ridge ClassifierCV' , rccv) + # Specify multiple Classification models + models = [('Logistic Regression' , LogisticRegression(**rs) ) + , ('Logistic RegressionCV' , LogisticRegressionCV(**rs) ) + , ('Gaussian NB' , GaussianNB() ) + , ('Naive Bayes' , BernoulliNB() ) + , ('K-Nearest Neighbors' , KNeighborsClassifier() ) + , ('SVC' , SVC(**rs) ) + , ('MLP' , MLPClassifier(max_iter = 500, **rs) ) + , ('Decision Tree' , DecisionTreeClassifier(**rs) ) + , ('Extra Trees' , ExtraTreesClassifier(**rs) ) + , ('Extra Tree' , ExtraTreeClassifier(**rs) ) + , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) ) + , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 + , n_estimators = 1000 + , bootstrap = True + , oob_score = True + , **njobs + , **rs + , max_features = 'auto') ) + , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) ) + , ('LDA' , LinearDiscriminantAnalysis() ) + , ('Multinomial' , MultinomialNB() ) + , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) + , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) + , ('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) + , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) ) + , ('Gaussian Process' , GaussianProcessClassifier(**rs) ) + , ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) + , ('QDA' , QuadraticDiscriminantAnalysis() ) + , ('Ridge Classifier' , RidgeClassifier(**rs) ) + , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 10) ) ] mm_skf_scoresD = {} - - for model_name, model_fn in models: - print('\nModel_name:', model_name - , '\nModel func:' , model_fn - , '\nList of models:', models) + + print('\n==============================================================\n' + , '\nRunning several classification models (n):', len(models) + ,'\nList of models:') + for m in models: + print(m) + print('\n================================================================\n') + index = 1 + for model_name, model_fn in models: + print('\nRunning classifier:', index + , '\nModel_name:' , model_name + , '\nModel func:' , model_fn) + index = index+1 + model_pipeline = Pipeline([ ('prep' , col_transform) , ('model' , model_fn)]) - print('Running model pipeline:', model_pipeline) - skf_cv_mod = cross_validate(model_pipeline + print('\nRunning model pipeline:', model_pipeline) + skf_cv_modD = cross_validate(model_pipeline , input_df , target , cv = skf_cv , scoring = scoring_fn - , return_train_score = True) + , return_train_score = True) + + #============================== + # Extract mean values for CV + #============================== mm_skf_scoresD[model_name] = {} - for key, value in skf_cv_mod.items(): + + for key, value in skf_cv_modD.items(): print('\nkey:', key, '\nvalue:', value) print('\nmean value:', mean(value)) mm_skf_scoresD[model_name][key] = round(mean(value),2) - #pp.pprint(mm_skf_scoresD) - #cvtrain_mcc = mm_skf_scoresD[model_name]['test_mcc'] #return(mm_skf_scoresD) #%% diff --git a/scripts/ml/MultModelsCl_dissected.py b/scripts/ml/MultModelsCl_dissected.py index d8804af..cabef15 100644 --- a/scripts/ml/MultModelsCl_dissected.py +++ b/scripts/ml/MultModelsCl_dissected.py @@ -101,6 +101,9 @@ jacc_score_fn = {'jcc': make_scorer(jaccard_score)} def MultModelsCl_dissected(input_df, target, skf_cv , blind_test_input_df , blind_test_target + , add_cm = True # adds confusion matrix based on cross_val_predict + , add_yn = True # adds target var class numbers + , feature_groups = [''] , var_type = ['numerical', 'categorical','mixed']): ''' @@ -201,52 +204,88 @@ def MultModelsCl_dissected(input_df, target, skf_cv , scoring = scoring_fn , return_train_score = True) - #---------- - # check 1 - #---------- - foo_df = pd.DataFrame.from_dict(skf_cv_modD, orient ='index') - #foo_df = pd.DataFrame.from_dict(skf_cv_modD) - - #=================== - # Confusion matrix: Not an easy problem to solve! STILL DOING it, USE with caution + ####################################################################### + #====================================================== + # Option 1: Add confusion matrix from cross_val_predict + # Understand and USE with caution # cross_val_score, cross_val_predict, "Passing these predictions into an evaluation metric may not be a valid way to measure generalization performance. Results can differ from cross_validate and cross_val_score unless all tests sets have equal size and the metric decomposes over samples." # https://stackoverflow.com/questions/65645125/producing-a-confusion-matrix-with-cross-validate - #=================== - y_pred = cross_val_predict(model_pipeline, input_df, target, cv = 10, **njobs) - #_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally - tn, fp, fn, tp = confusion_matrix(y_pred, target).ravel() - # create a dict of confusion matrix that can be appended to the one above - # cmD = {'TN' : np.array(tn) - # , 'FP': np.array(fp) - # , 'FN': np.array(fn) - # , 'TP': np.array(tp)} + #====================================================== + if add_cm: + + #----------------------------------------------------------- + # Initialise dict of Confusion Matrix (cm) + #----------------------------------------------------------- + cmD = {} + + # Calculate cm + y_pred = cross_val_predict(model_pipeline, input_df, target, cv = skf_cv, **njobs) + #_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally + tn, fp, fn, tp = confusion_matrix(y_pred, target).ravel() + + # Build dict + + cmD = {'TN' : tn + , 'FP': fp + , 'FN': fn + , 'TP': tp} + #--------------------------------- + # Update cv dict with cmD and tbtD + #---------------------------------- + skf_cv_modD.update(cmD) + else: + skf_cv_modD = skf_cv_modD + ####################################################################### + #============================================= + # Option 2: Add targety numbers for data + #============================================= + if add_yn: + + #----------------------------------------------------------- + # Initialise dict of target numbers: training and blind (tbt) + #----------------------------------------------------------- + tbtD = {} - cmD = {'TN' : tn - , 'FP': fp - , 'FN': fn - , 'TP': tp} - skf_cv_modD.update(cmD) - - #---------- - # check 2 - #---------- - #foo2_df = pd.DataFrame.from_dict(skf_cv_modD, orient ='index') - #foo_df = pd.DataFrame.from_dict(skf_cv_modD) + # training y + tyn = Counter(target) + tyn_neg = tyn[0] + tyn_pos = tyn[1] + + # blind test y + btyn = Counter(blind_test_target) + btyn_neg = btyn[0] + btyn_pos = btyn[1] + + # Build dict + tbtD = {'trainingY_neg' : tyn_neg + , 'trainingY_pos' : tyn_pos + , 'blindY_neg' : btyn_neg + , 'blindY_pos' : btyn_pos} + + #--------------------------------- + # Update cv dict with cmD and tbtD + #---------------------------------- + skf_cv_modD.update(tbtD) + else: + skf_cv_modD = skf_cv_modD + ####################################################################### + #============================== + # Extract mean values for CV + #============================== mm_skf_scoresD[model_name] = {} + for key, value in skf_cv_modD.items(): print('\nkey:', key, '\nvalue:', value) print('\nmean value:', np.mean(value)) mm_skf_scoresD[model_name][key] = round(np.mean(value),2) - - #return(mm_skf_scoresD) #%% #========================= # Blind test: BTS results #========================= - # Build the final results with all scores for a feature selected model + # Build the final results with all scores for the model #bts_predict = gscv_fs.predict(blind_test_input_df) model_pipeline.fit(input_df, target) bts_predict = model_pipeline.predict(blind_test_input_df) @@ -255,22 +294,6 @@ def MultModelsCl_dissected(input_df, target, skf_cv print('\nMCC on Blind test:' , bts_mcc_score) print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2)) - # Diff b/w train and bts test scores - #train_test_diff_MCC = cvtrain_mcc - bts_mcc_score - # print('\nDiff b/w train and blind test score (MCC):', train_test_diff) - - - # # create a dict with all scores - # lr_btsD = { 'model_name': model_name - # , 'bts_mcc':None - # , 'bts_fscore':None - # , 'bts_precision':None - # , 'bts_recall':None - # , 'bts_accuracy':None - # , 'bts_roc_auc':None - # , 'bts_jaccard':None} - - mm_skf_scoresD[model_name]['bts_mcc'] = bts_mcc_score mm_skf_scoresD[model_name]['bts_fscore'] = round(f1_score(blind_test_target, bts_predict),2) mm_skf_scoresD[model_name]['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2) diff --git a/scripts/ml/pnca_config_dissected.py b/scripts/ml/pnca_config_dissected.py index 3c3868d..a4b3873 100644 --- a/scripts/ml/pnca_config_dissected.py +++ b/scripts/ml/pnca_config_dissected.py @@ -104,29 +104,29 @@ else: print('\n#####################################################################\n') -############################################################################### -#================== -# Baseline models -#================== -mm_skf_scoresD = MultModelsCl(input_df = X - , target = y - , var_type = 'mixed' - , skf_cv = skf_cv - , blind_test_input_df = X_bts - , blind_test_target = y_bts) +# ############################################################################### +# #================== +# # Baseline models +# #================== +# mm_skf_scoresD = MultModelsCl(input_df = X +# , target = y +# , var_type = 'mixed' +# , skf_cv = skf_cv +# , blind_test_input_df = X_bts +# , blind_test_target = y_bts) -baseline_all = pd.DataFrame(mm_skf_scoresD) -baseline_all = baseline_all.T -#baseline_train = baseline_all.filter(like='train_', axis=1) -baseline_CT = baseline_all.filter(like='test_', axis=1) -baseline_CT.sort_values(by=['test_mcc'], ascending=False, inplace=True) +# baseline_all = pd.DataFrame(mm_skf_scoresD) +# baseline_all = baseline_all.T +# #baseline_train = baseline_all.filter(like='train_', axis=1) +# baseline_CT = baseline_all.filter(like='test_', axis=1) +# baseline_CT.sort_values(by=['test_mcc'], ascending=False, inplace=True) -baseline_BT = baseline_all.filter(like='bts_', axis=1) -baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) +# baseline_BT = baseline_all.filter(like='bts_', axis=1) +# baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) -# Write csv -baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv') -baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv') +# # Write csv +# baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv') +# baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv') # #%% SMOTE NC: Oversampling [Numerical + categorical]