diff --git a/UQ_LR.py b/UQ_LR.py deleted file mode 100644 index 9f20f32..0000000 --- a/UQ_LR.py +++ /dev/null @@ -1,207 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Mon May 16 05:59:12 2022 - -@author: tanu -""" -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Tue Mar 15 11:09:50 2022 - -@author: tanu -""" -#%% Import libs -import numpy as np -import pandas as pd -from sklearn.model_selection import GridSearchCV -from sklearn import datasets -from sklearn.ensemble import ExtraTreesClassifier -from sklearn.ensemble import RandomForestClassifier -from sklearn.ensemble import AdaBoostClassifier -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.svm import SVC - -from sklearn.base import BaseEstimator -from sklearn.naive_bayes import MultinomialNB -from sklearn.linear_model import SGDClassifier -from sklearn.pipeline import Pipeline -from sklearn.model_selection import GridSearchCV -from sklearn.linear_model import LogisticRegression -from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder -from xgboost import XGBClassifier -rs = {'random_state': 42} -njobs = {'n_jobs': 10} -#%% Get train-test split and scoring functions -# X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN] -# , num_df_wtgt['mutation_class'] -# , test_size = 0.33 -# , random_state = 2 -# , shuffle = True -# , stratify = num_df_wtgt['mutation_class']) - -y.to_frame().value_counts().plot(kind = 'bar') -blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar') - -scoring_fn = ({'accuracy' : make_scorer(accuracy_score) - , 'fscore' : make_scorer(f1_score) - , 'mcc' : make_scorer(matthews_corrcoef) - , 'precision' : make_scorer(precision_score) - , 'recall' : make_scorer(recall_score) - , 'roc_auc' : make_scorer(roc_auc_score) - , 'jaccard' : make_scorer(jaccard_score) - }) - -mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} -jacc_score_fn = {'jcc': make_scorer(jaccard_score)} - -#%% Logistic Regression + hyperparam: BaseEstimator: ClfSwitcher() -class ClfSwitcher(BaseEstimator): - def __init__( - self, - estimator = SGDClassifier(), - ): - """ - A Custom BaseEstimator that can switch between classifiers. - :param estimator: sklearn object - The classifier - """ - self.estimator = estimator - - def fit(self, X, y=None, **kwargs): - self.estimator.fit(X, y) - return self - - def predict(self, X, y=None): - return self.estimator.predict(X) - - def predict_proba(self, X): - return self.estimator.predict_proba(X) - - def score(self, X, y): - return self.estimator.score(X, y) - -parameters = [ - { - 'clf__estimator': [LogisticRegression(**rs)], - #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], - 'clf__estimator__C': np.logspace(0, 4, 10), - 'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'], - 'clf__estimator__max_iter': list(range(100,800,100)), - 'clf__estimator__solver': ['saga'] - }, - { - 'clf__estimator': [LogisticRegression(**rs)], - #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], - 'clf__estimator__C': np.logspace(0, 4, 10), - 'clf__estimator__penalty': ['l2', 'none'], - 'clf__estimator__max_iter': list(range(100,800,100)), - 'clf__estimator__solver': ['newton-cg', 'lbfgs', 'sag'] - }, - { - 'clf__estimator': [LogisticRegression(**rs)], - #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], - 'clf__estimator__C': np.logspace(0, 4, 10), - 'clf__estimator__penalty': ['l1', 'l2'], - 'clf__estimator__max_iter': list(range(100,800,100)), - 'clf__estimator__solver': ['liblinear'] - } - -] - -# Create pipeline -pipeline = Pipeline([ - ('pre', MinMaxScaler()), - ('clf', ClfSwitcher()), -]) - -# Grid search i.e hyperparameter tuning and refitting on mcc -gscv_lr = GridSearchCV(pipeline - , parameters - #, scoring = 'f1', refit = 'f1' - , scoring = mcc_score_fn, refit = 'mcc' - , cv = skf_cv - , **njobs - , return_train_score = False - , verbose = 3) - -# Fit -gscv_lr_fit = gscv_lr.fit(X, y) -gscv_lr_fit_be_mod = gscv_lr_fit.best_params_ -gscv_lr_fit_be_res = gscv_lr_fit.cv_results_ - -print('Best model:\n', gscv_lr_fit_be_mod) -print('Best models score:\n', gscv_lr_fit.best_score_, ':' , round(gscv_lr_fit.best_score_, 2)) - -#print('\nMean test score from fit results:', round(mean(gscv_lr_fit_be_res['mean_test_mcc']),2)) -print('\nMean test score from fit results:', round(np.nanmean(gscv_lr_fit_be_res['mean_test_mcc']),2)) - - -###################################### -# Blind test -###################################### -# See how it does on the BLIND test -#print('\nBlind test score, mcc:', )) - -test_predict = gscv_lr_fit.predict(X_bts) -print(test_predict) -print(np.array(y_bts)) -y_btsf = np.array(y_bts) - -print(accuracy_score(y_bts, test_predict)) -print(matthews_corrcoef(y_bts, test_predict)) - -# create a dict with all scores -lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items()) - 'bts_fscore':None - , 'bts_mcc':None - , 'bts_precision':None - , 'bts_recall':None - , 'bts_accuracy':None - , 'bts_roc_auc':None - , 'bts_jaccard':None } -lr_bts_dict -lr_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) -lr_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) -lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) -lr_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) -lr_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) -lr_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) -lr_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) -lr_bts_dict - -# Create a df from dict with all scores -pd.DataFrame.from_dict(lr_bts_dict, orient = 'index', columns = 'best_model') - -lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict,orient = 'index') -lr_bts_df.columns = ['Logistic_Regression'] -print(lr_bts_df) - -# d2 = {'best_model_params': lis(gscv_lr_fit_be_mod.items() )} -# d2 -# def Merge(dict1, dict2): -# res = {**dict1, **dict2} -# return res -# d3 = Merge(d2, lr_bts_dict) -# d3 - -# Create df with best model params -model_params = pd.Series(['best_model_params', list(gscv_lr_fit_be_mod.items() )]) -model_params_df = model_params.to_frame() -model_params_df -model_params_df.columns = ['Logistic_Regression'] -model_params_df.columns - -# Combine the df of scores and the best model params -lr_bts_df.columns -lr_output = pd.concat([model_params_df, lr_bts_df], axis = 0) -lr_output - -# Format the combined df -# Drop the best_model_params row from lr_output -lr_df = lr_output.drop([0], axis = 0) -lr_df - -#FIXME: tidy the index of the formatted df - -############################################################################### diff --git a/UQ_RF.py b/UQ_RF.py index ca3c292..36d9a50 100644 --- a/UQ_RF.py +++ b/UQ_RF.py @@ -39,8 +39,8 @@ parameters = [ 'clf__estimator__max_depth': [4, 6, 8, 10, 12, 16, 20, None] , 'clf__estimator__class_weight':['balanced','balanced_subsample'] , 'clf__estimator__n_estimators': [10, 25, 50, 100] - , 'clf__estimator__criterion': ['gini', 'entropy']#, 'log_loss'] - #, 'clf__estimator__max_features': ['auto', 'sqrt'] + , 'clf__estimator__criterion': ['gini', 'entropy', 'log_loss'] + , 'clf__estimator__max_features': ['sqrt', 'log2', None] #deafult is sqrt , 'clf__estimator__min_samples_leaf': [1, 2, 3, 4, 5, 10] , 'clf__estimator__min_samples_split': [2, 5, 15, 20] } diff --git a/UQ_pnca_ML.py b/UQ_pnca_ML.py index 48ab9a0..76b5dfa 100644 --- a/UQ_pnca_ML.py +++ b/UQ_pnca_ML.py @@ -207,8 +207,8 @@ X_genomicFN = ['maf' # , 'or_fisher' # , 'pval_fisher' #, 'lineage' - , 'lineage_count_all' - , 'lineage_count_unique' + #, 'lineage_count_all' + #, 'lineage_count_unique' ] #%% Construct numerical and categorical column names @@ -256,7 +256,7 @@ all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']] all_df_wtgt.shape #%%================================================================ #%% Apply ML -#TODO: Apply oversampling! +#TODO: A #%% Data #X = all_df_wtgt[numerical_FN+categorical_FN] @@ -272,16 +272,16 @@ X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']] # Quick check (X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum() #%% MultClassPipeSKFCV: function call() -mm_skf_scoresD = MultClassPipeSKFCV(input_df = X - , target = y - , var_type = 'numerical' - , skf_cv = skf_cv) +# mm_skf_scoresD = MultClassPipeSKFCV(input_df = X +# , target = y +# , var_type = 'numerical' +# , skf_cv = skf_cv) -mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD) -mm_skf_scores_df_all -mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0) -mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results -print(mm_skf_scores_df_train) -print(mm_skf_scores_df_test) +# mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD) +# mm_skf_scores_df_all +# mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0) +# mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results +# print(mm_skf_scores_df_train) +# print(mm_skf_scores_df_test)