saving work with scripts for feature selection
parent a9dc3c43e5, commit fa0f5e5b39
3 changed files with 15 additions and 222 deletions
UQ_LR.py | 207 (file deleted)
@@ -1,207 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022
Updated on Mon May 16 05:59:12 2022

@author: tanu
"""
#%% Import libs
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.base import BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# Metrics used by the scoring functions and the blind-test evaluation below
from sklearn.metrics import (make_scorer, accuracy_score, f1_score
                             , matthews_corrcoef, precision_score, recall_score
                             , roc_auc_score, jaccard_score)

from xgboost import XGBClassifier

rs = {'random_state': 42}
njobs = {'n_jobs': 10}
#%% Get train-test split and scoring functions
# NOTE: X, y, X_bts, y_bts, blind_test_df and skf_cv are assumed to be defined
# by the upstream data-loading script (see the data section of UQ_RF.py).

# X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
#                                                      , num_df_wtgt['mutation_class']
#                                                      , test_size = 0.33
#                                                      , random_state = 2
#                                                      , shuffle = True
#                                                      , stratify = num_df_wtgt['mutation_class'])

# Quick look at the class balance of the training and blind-test targets
y.to_frame().value_counts().plot(kind = 'bar')
blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar')

scoring_fn = ({'accuracy'    : make_scorer(accuracy_score)
               , 'fscore'    : make_scorer(f1_score)
               , 'mcc'       : make_scorer(matthews_corrcoef)
               , 'precision' : make_scorer(precision_score)
               , 'recall'    : make_scorer(recall_score)
               , 'roc_auc'   : make_scorer(roc_auc_score)
               , 'jaccard'   : make_scorer(jaccard_score)
               })

mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
#%% Logistic Regression + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)
parameters = [
    {
        'clf__estimator': [LogisticRegression(**rs)],
        #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'clf__estimator__C': np.logspace(0, 4, 10),
        'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
        # NOTE: 'elasticnet' with the saga solver additionally requires l1_ratio to be set
        'clf__estimator__max_iter': list(range(100,800,100)),
        'clf__estimator__solver': ['saga']
    },
    {
        'clf__estimator': [LogisticRegression(**rs)],
        #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'clf__estimator__C': np.logspace(0, 4, 10),
        'clf__estimator__penalty': ['l2', 'none'],
        'clf__estimator__max_iter': list(range(100,800,100)),
        'clf__estimator__solver': ['newton-cg', 'lbfgs', 'sag']
    },
    {
        'clf__estimator': [LogisticRegression(**rs)],
        #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'clf__estimator__C': np.logspace(0, 4, 10),
        'clf__estimator__penalty': ['l1', 'l2'],
        'clf__estimator__max_iter': list(range(100,800,100)),
        'clf__estimator__solver': ['liblinear']
    }
]
# Create pipeline
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e. hyperparameter tuning and refitting on mcc
# skf_cv: the CV splitter (assumed to be a stratified k-fold defined in the data-loading script)
gscv_lr = GridSearchCV(pipeline
                       , parameters
                       #, scoring = 'f1', refit = 'f1'
                       , scoring = mcc_score_fn, refit = 'mcc'
                       , cv = skf_cv
                       , **njobs
                       , return_train_score = False
                       , verbose = 3)

# Fit
gscv_lr_fit = gscv_lr.fit(X, y)
gscv_lr_fit_be_mod = gscv_lr_fit.best_params_
gscv_lr_fit_be_res = gscv_lr_fit.cv_results_

print('Best model:\n', gscv_lr_fit_be_mod)
print('Best model score:\n', gscv_lr_fit.best_score_, ':', round(gscv_lr_fit.best_score_, 2))

#print('\nMean test score from fit results:', round(mean(gscv_lr_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_lr_fit_be_res['mean_test_mcc']),2))

######################################
# Blind test
######################################
# See how it does on the BLIND test
#print('\nBlind test score, mcc:', ))

test_predict = gscv_lr_fit.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
y_btsf = np.array(y_bts)

print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))

# create a dict with all scores
lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items())
               'bts_fscore' : None
               , 'bts_mcc' : None
               , 'bts_precision' : None
               , 'bts_recall' : None
               , 'bts_accuracy' : None
               , 'bts_roc_auc' : None
               , 'bts_jaccard' : None }
lr_bts_dict

lr_bts_dict['bts_fscore']    = round(f1_score(y_bts, test_predict), 2)
lr_bts_dict['bts_mcc']       = round(matthews_corrcoef(y_bts, test_predict), 2)
lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict), 2)
lr_bts_dict['bts_recall']    = round(recall_score(y_bts, test_predict), 2)
lr_bts_dict['bts_accuracy']  = round(accuracy_score(y_bts, test_predict), 2)
lr_bts_dict['bts_roc_auc']   = round(roc_auc_score(y_bts, test_predict), 2)
lr_bts_dict['bts_jaccard']   = round(jaccard_score(y_bts, test_predict), 2)
lr_bts_dict

# Create a df from dict with all scores
# (columns must be a list when orient = 'index' is used)
pd.DataFrame.from_dict(lr_bts_dict, orient = 'index', columns = ['best_model'])

lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict, orient = 'index')
lr_bts_df.columns = ['Logistic_Regression']
print(lr_bts_df)

# d2 = {'best_model_params': list(gscv_lr_fit_be_mod.items() )}
# d2
# def Merge(dict1, dict2):
#     res = {**dict1, **dict2}
#     return res
# d3 = Merge(d2, lr_bts_dict)
# d3

# Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_lr_fit_be_mod.items() )])
model_params_df = model_params.to_frame()
model_params_df
model_params_df.columns = ['Logistic_Regression']
model_params_df.columns

# Combine the df of scores and the best model params
lr_bts_df.columns
lr_output = pd.concat([model_params_df, lr_bts_df], axis = 0)
lr_output

# Format the combined df
# Drop the best_model_params row from lr_output
lr_df = lr_output.drop([0], axis = 0)
lr_df

#FIXME: tidy the index of the formatted df

###############################################################################
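For context, the core idea in the deleted script is a ClfSwitcher estimator that lets GridSearchCV treat the final classifier itself as a tunable hyperparameter, with one parameter grid per candidate model. Below is a minimal, self-contained sketch of that pattern on a toy dataset; the toy data, the pared-down grids and the fold settings are illustrative stand-ins, not the project's own data or full search space.

# Sketch of the ClfSwitcher + GridSearchCV pattern (illustrative values only)
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

class ClfSwitcher(BaseEstimator):
    """Wrapper whose 'estimator' parameter GridSearchCV can swap out."""
    def __init__(self, estimator = SGDClassifier()):
        self.estimator = estimator
    def fit(self, X, y = None, **kwargs):
        self.estimator.fit(X, y)
        return self
    def predict(self, X):
        return self.estimator.predict(X)
    def score(self, X, y):
        return self.estimator.score(X, y)

# Toy data standing in for the real feature matrix and target
X_toy, y_toy = make_classification(n_samples = 200, n_features = 10, random_state = 42)

pipeline = Pipeline([('pre', MinMaxScaler()), ('clf', ClfSwitcher())])

# One grid per candidate classifier; GridSearchCV searches across all of them
parameters = [
    {'clf__estimator': [LogisticRegression(random_state = 42, max_iter = 500)],
     'clf__estimator__C': np.logspace(0, 4, 5)},
    {'clf__estimator': [RandomForestClassifier(random_state = 42)],
     'clf__estimator__n_estimators': [10, 50]},
]

skf_cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
gscv = GridSearchCV(pipeline, parameters
                    , scoring = {'mcc': make_scorer(matthews_corrcoef)}, refit = 'mcc'
                    , cv = skf_cv, n_jobs = 1)
gscv.fit(X_toy, y_toy)
print(gscv.best_params_)

The hunks from UQ_RF.py below suggest it keeps the same skeleton and only swaps in a different per-model parameter grid.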
UQ_RF.py | 4
@@ -39,8 +39,8 @@ parameters = [
 'clf__estimator__max_depth': [4, 6, 8, 10, 12, 16, 20, None]
 , 'clf__estimator__class_weight':['balanced','balanced_subsample']
 , 'clf__estimator__n_estimators': [10, 25, 50, 100]
-, 'clf__estimator__criterion': ['gini', 'entropy']#, 'log_loss']
-#, 'clf__estimator__max_features': ['auto', 'sqrt']
+, 'clf__estimator__criterion': ['gini', 'entropy', 'log_loss']
+, 'clf__estimator__max_features': ['sqrt', 'log2', None] # default is sqrt
 , 'clf__estimator__min_samples_leaf': [1, 2, 3, 4, 5, 10]
 , 'clf__estimator__min_samples_split': [2, 5, 15, 20]
 }
@@ -207,8 +207,8 @@ X_genomicFN = ['maf'
 # , 'or_fisher'
 # , 'pval_fisher'
 #, 'lineage'
-, 'lineage_count_all'
-, 'lineage_count_unique'
+#, 'lineage_count_all'
+#, 'lineage_count_unique'
 ]

 #%% Construct numerical and categorical column names
@@ -256,7 +256,7 @@ all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']]
 all_df_wtgt.shape
 #%%================================================================
 #%% Apply ML
-#TODO: Apply oversampling!
+#TODO: A

 #%% Data
 #X = all_df_wtgt[numerical_FN+categorical_FN]
@@ -272,16 +272,16 @@ X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']]
 # Quick check
 (X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
 #%% MultClassPipeSKFCV: function call()
-mm_skf_scoresD = MultClassPipeSKFCV(input_df = X
-                                    , target = y
-                                    , var_type = 'numerical'
-                                    , skf_cv = skf_cv)
+# mm_skf_scoresD = MultClassPipeSKFCV(input_df = X
+#                                     , target = y
+#                                     , var_type = 'numerical'
+#                                     , skf_cv = skf_cv)


-mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
-mm_skf_scores_df_all
-mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0)
-mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results
-print(mm_skf_scores_df_train)
-print(mm_skf_scores_df_test)
+# mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
+# mm_skf_scores_df_all
+# mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0)
+# mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results
+# print(mm_skf_scores_df_train)
+# print(mm_skf_scores_df_test)
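Both UQ_LR.py and UQ_RF.py pass a pre-built skf_cv splitter into GridSearchCV and MultClassPipeSKFCV without defining it in the code shown here, so it has to come from the upstream data script. The snippet below is a minimal sketch of how such a stratified splitter is typically constructed with scikit-learn; the fold count and seed are placeholder values, not ones taken from this repository.

# Illustrative only: a stratified k-fold splitter of the kind these scripts
# expect as `skf_cv` (n_splits and random_state are placeholder values).
from sklearn.model_selection import StratifiedKFold

skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)

# Each split preserves the class balance of y in both the train and test folds:
# for train_idx, test_idx in skf_cv.split(X, y):
#     ...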