saving work with scripts for feature selection

Tanushree Tunstall 2022-05-19 08:30:18 +01:00
parent a9dc3c43e5
commit fa0f5e5b39
3 changed files with 15 additions and 222 deletions

UQ_LR.py (207 lines deleted)

@@ -1,207 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022
Updated on Mon May 16 05:59:12 2022

@author: tanu
"""
#%% Import libs
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.base import BaseEstimator
from sklearn.ensemble import (AdaBoostClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import (accuracy_score, f1_score, jaccard_score,
                             make_scorer, matthews_corrcoef, precision_score,
                             recall_score, roc_auc_score)
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier
rs = {'random_state': 42}
njobs = {'n_jobs': 10}
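# NOTE (assumption): X, y, X_bts, y_bts and blind_test_df are expected to be
# defined by an upstream data script. skf_cv is sketched here as the
# stratified CV splitter the grid search below assumes; drop this line if the
# upstream script supplies its own.
skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, **rs)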
#%% Get train-test split and scoring functions
# X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
# , num_df_wtgt['mutation_class']
# , test_size = 0.33
# , random_state = 2
# , shuffle = True
# , stratify = num_df_wtgt['mutation_class'])
y.to_frame().value_counts().plot(kind = 'bar')
blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar')
scoring_fn = ({'accuracy' : make_scorer(accuracy_score)
, 'fscore' : make_scorer(f1_score)
, 'mcc' : make_scorer(matthews_corrcoef)
, 'precision' : make_scorer(precision_score)
, 'recall' : make_scorer(recall_score)
, 'roc_auc' : make_scorer(roc_auc_score)
, 'jaccard' : make_scorer(jaccard_score)
})
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
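# scoring_fn is the full multi-metric dictionary; mcc_score_fn and
# jacc_score_fn are single-metric variants for grid searches that refit on
# just one score (e.g. refit = 'mcc' below).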
#%% Logistic Regression + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A custom BaseEstimator that can switch between classifiers.

        :param estimator: sklearn object - the classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)
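
# Illustrative note: exposing 'estimator' as a constructor parameter lets
# GridSearchCV swap whole classifiers through the 'clf__estimator' key, so a
# single search can compare, e.g., LogisticRegression(**rs) vs
# SGDClassifier(**rs) alongside their hyperparameters.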
parameters = [
{
'clf__estimator': [LogisticRegression(**rs)],
#'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'clf__estimator__C': np.logspace(0, 4, 10),
'clf__estimator__penalty': ['none', 'l1', 'l2'],
'clf__estimator__max_iter': list(range(100,800,100)),
'clf__estimator__solver': ['saga']
},
{
'clf__estimator': [LogisticRegression(**rs)],
'clf__estimator__C': np.logspace(0, 4, 10),
'clf__estimator__penalty': ['elasticnet'],
'clf__estimator__l1_ratio': [0.25, 0.5, 0.75], # l1_ratio is required for elasticnet
'clf__estimator__max_iter': list(range(100,800,100)),
'clf__estimator__solver': ['saga']
},
{
'clf__estimator': [LogisticRegression(**rs)],
#'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'clf__estimator__C': np.logspace(0, 4, 10),
'clf__estimator__penalty': ['l2', 'none'],
'clf__estimator__max_iter': list(range(100,800,100)),
'clf__estimator__solver': ['newton-cg', 'lbfgs', 'sag']
},
{
'clf__estimator': [LogisticRegression(**rs)],
#'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'clf__estimator__C': np.logspace(0, 4, 10),
'clf__estimator__penalty': ['l1', 'l2'],
'clf__estimator__max_iter': list(range(100,800,100)),
'clf__estimator__solver': ['liblinear']
}
]
# Create pipeline
pipeline = Pipeline([
('pre', MinMaxScaler()),
('clf', ClfSwitcher()),
])
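# Design note: MinMaxScaler keeps every feature in [0, 1]; gradient-based
# solvers such as saga typically converge faster on scaled inputs than on
# raw features.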
# Grid search, i.e. hyperparameter tuning, refitting on MCC
gscv_lr = GridSearchCV(pipeline
, parameters
#, scoring = 'f1', refit = 'f1'
, scoring = mcc_score_fn, refit = 'mcc'
, cv = skf_cv
, **njobs
, return_train_score = False
, verbose = 3)
# Fit
gscv_lr_fit = gscv_lr.fit(X, y)
gscv_lr_fit_be_mod = gscv_lr_fit.best_params_
gscv_lr_fit_be_res = gscv_lr_fit.cv_results_
print('Best model:\n', gscv_lr_fit_be_mod)
print("Best model's score:\n", gscv_lr_fit.best_score_, ':', round(gscv_lr_fit.best_score_, 2))
#print('\nMean test score from fit results:', round(mean(gscv_lr_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_lr_fit_be_res['mean_test_mcc']),2))
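# np.nanmean guards against NaN entries in cv_results_: with sklearn's
# default error_score = np.nan, any parameter combination that fails to fit
# leaves NaN test scores rather than raising.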
######################################
# Blind test
######################################
# See how it does on the BLIND test
#print('\nBlind test score, mcc:', ))
test_predict = gscv_lr_fit.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
y_btsf = np.array(y_bts)
print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))
# create a dict with all scores
lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items())
'bts_fscore':None
, 'bts_mcc':None
, 'bts_precision':None
, 'bts_recall':None
, 'bts_accuracy':None
, 'bts_roc_auc':None
, 'bts_jaccard':None }
lr_bts_dict
lr_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2)
lr_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2)
lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
lr_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2)
lr_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2)
lr_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2)
lr_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
lr_bts_dict
# Create a df from dict with all scores
lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict,orient = 'index')
lr_bts_df.columns = ['Logistic_Regression']
print(lr_bts_df)
# d2 = {'best_model_params': list(gscv_lr_fit_be_mod.items() )}
# d2
# def Merge(dict1, dict2):
# res = {**dict1, **dict2}
# return res
# d3 = Merge(d2, lr_bts_dict)
# d3
# Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_lr_fit_be_mod.items() )])
model_params_df = model_params.to_frame()
model_params_df
model_params_df.columns = ['Logistic_Regression']
model_params_df.columns
# Combine the df of scores and the best model params
lr_bts_df.columns
lr_output = pd.concat([model_params_df, lr_bts_df], axis = 0)
lr_output
# Format the combined df
# Drop the best_model_params row from lr_output
lr_df = lr_output.drop([0], axis = 0)
lr_df
#FIXME: tidy the index of the formatted df
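# A possible tidy-up for the FIXME above (sketch, not part of the original run):
# lr_df.index.name = 'metric'
# lr_df = lr_df.reset_index()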
###############################################################################


@@ -39,8 +39,8 @@ parameters = [
'clf__estimator__max_depth': [4, 6, 8, 10, 12, 16, 20, None]
, 'clf__estimator__class_weight':['balanced','balanced_subsample']
, 'clf__estimator__n_estimators': [10, 25, 50, 100]
-, 'clf__estimator__criterion': ['gini', 'entropy']#, 'log_loss']
-#, 'clf__estimator__max_features': ['auto', 'sqrt']
+, 'clf__estimator__criterion': ['gini', 'entropy', 'log_loss']
+, 'clf__estimator__max_features': ['sqrt', 'log2', None] # default is sqrt
, 'clf__estimator__min_samples_leaf': [1, 2, 3, 4, 5, 10]
, 'clf__estimator__min_samples_split': [2, 5, 15, 20]
}


@@ -207,8 +207,8 @@ X_genomicFN = ['maf'
# , 'or_fisher'
# , 'pval_fisher'
#, 'lineage'
-, 'lineage_count_all'
-, 'lineage_count_unique'
+#, 'lineage_count_all'
+#, 'lineage_count_unique'
]
#%% Construct numerical and categorical column names
@@ -256,7 +256,7 @@ all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']]
all_df_wtgt.shape
#%%================================================================
#%% Apply ML
-#TODO: Apply oversampling!
+#TODO: A
#%% Data
#X = all_df_wtgt[numerical_FN+categorical_FN]
@@ -272,16 +272,16 @@ X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']]
# Quick check
(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
#%% MultClassPipeSKFCV: function call()
-mm_skf_scoresD = MultClassPipeSKFCV(input_df = X
-, target = y
-, var_type = 'numerical'
-, skf_cv = skf_cv)
+# mm_skf_scoresD = MultClassPipeSKFCV(input_df = X
+# , target = y
+# , var_type = 'numerical'
+# , skf_cv = skf_cv)
-mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
-mm_skf_scores_df_all
-mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0)
-mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results
-print(mm_skf_scores_df_train)
-print(mm_skf_scores_df_test)
+# mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
+# mm_skf_scores_df_all
+# mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0)
+# mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results
+# print(mm_skf_scores_df_train)
+# print(mm_skf_scores_df_test)