saving work with scripts for feature selection
parent a9dc3c43e5, commit fa0f5e5b39
3 changed files with 15 additions and 222 deletions
UQ_LR.py | 207 (file deleted)
@@ -1,207 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022
Updated on Mon May 16 05:59:12 2022

@author: tanu
"""
#%% Import libs
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.base import BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# Metrics used by the scoring functions and the blind-test evaluation below
from sklearn.metrics import (make_scorer, accuracy_score, f1_score
                             , matthews_corrcoef, precision_score, recall_score
                             , roc_auc_score, jaccard_score)

from xgboost import XGBClassifier

rs = {'random_state': 42}
njobs = {'n_jobs': 10}
#%% Get train-test split and scoring functions
# NOTE: X, y, X_bts, y_bts, blind_test_df and skf_cv are assumed to be defined
# by the upstream data-loading script (see the data section of UQ_RF.py).

# X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
#                                                      , num_df_wtgt['mutation_class']
#                                                      , test_size = 0.33
#                                                      , random_state = 2
#                                                      , shuffle = True
#                                                      , stratify = num_df_wtgt['mutation_class'])

# Quick look at the class balance of the training and blind-test targets
y.to_frame().value_counts().plot(kind = 'bar')
blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar')

scoring_fn = ({'accuracy'    : make_scorer(accuracy_score)
               , 'fscore'    : make_scorer(f1_score)
               , 'mcc'       : make_scorer(matthews_corrcoef)
               , 'precision' : make_scorer(precision_score)
               , 'recall'    : make_scorer(recall_score)
               , 'roc_auc'   : make_scorer(roc_auc_score)
               , 'jaccard'   : make_scorer(jaccard_score)
               })

mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
#%% Logistic Regression + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)
parameters = [
    {
        'clf__estimator': [LogisticRegression(**rs)],
        #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'clf__estimator__C': np.logspace(0, 4, 10),
        'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
        # NOTE: 'elasticnet' with the saga solver additionally requires l1_ratio to be set
        'clf__estimator__max_iter': list(range(100,800,100)),
        'clf__estimator__solver': ['saga']
    },
    {
        'clf__estimator': [LogisticRegression(**rs)],
        #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'clf__estimator__C': np.logspace(0, 4, 10),
        'clf__estimator__penalty': ['l2', 'none'],
        'clf__estimator__max_iter': list(range(100,800,100)),
        'clf__estimator__solver': ['newton-cg', 'lbfgs', 'sag']
    },
    {
        'clf__estimator': [LogisticRegression(**rs)],
        #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'clf__estimator__C': np.logspace(0, 4, 10),
        'clf__estimator__penalty': ['l1', 'l2'],
        'clf__estimator__max_iter': list(range(100,800,100)),
        'clf__estimator__solver': ['liblinear']
    }
]
# Create pipeline
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e. hyperparameter tuning and refitting on mcc
# skf_cv: the CV splitter (assumed to be a stratified k-fold defined in the data-loading script)
gscv_lr = GridSearchCV(pipeline
                       , parameters
                       #, scoring = 'f1', refit = 'f1'
                       , scoring = mcc_score_fn, refit = 'mcc'
                       , cv = skf_cv
                       , **njobs
                       , return_train_score = False
                       , verbose = 3)

# Fit
gscv_lr_fit = gscv_lr.fit(X, y)
gscv_lr_fit_be_mod = gscv_lr_fit.best_params_
gscv_lr_fit_be_res = gscv_lr_fit.cv_results_

print('Best model:\n', gscv_lr_fit_be_mod)
print('Best model score:\n', gscv_lr_fit.best_score_, ':', round(gscv_lr_fit.best_score_, 2))

#print('\nMean test score from fit results:', round(mean(gscv_lr_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_lr_fit_be_res['mean_test_mcc']),2))

######################################
# Blind test
######################################
# See how it does on the BLIND test
#print('\nBlind test score, mcc:', ))

test_predict = gscv_lr_fit.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
y_btsf = np.array(y_bts)

print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))

# create a dict with all scores
lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items())
               'bts_fscore' : None
               , 'bts_mcc' : None
               , 'bts_precision' : None
               , 'bts_recall' : None
               , 'bts_accuracy' : None
               , 'bts_roc_auc' : None
               , 'bts_jaccard' : None }
lr_bts_dict

lr_bts_dict['bts_fscore']    = round(f1_score(y_bts, test_predict), 2)
lr_bts_dict['bts_mcc']       = round(matthews_corrcoef(y_bts, test_predict), 2)
lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict), 2)
lr_bts_dict['bts_recall']    = round(recall_score(y_bts, test_predict), 2)
lr_bts_dict['bts_accuracy']  = round(accuracy_score(y_bts, test_predict), 2)
lr_bts_dict['bts_roc_auc']   = round(roc_auc_score(y_bts, test_predict), 2)
lr_bts_dict['bts_jaccard']   = round(jaccard_score(y_bts, test_predict), 2)
lr_bts_dict

# Create a df from dict with all scores
# (columns must be a list when orient = 'index' is used)
pd.DataFrame.from_dict(lr_bts_dict, orient = 'index', columns = ['best_model'])

lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict, orient = 'index')
lr_bts_df.columns = ['Logistic_Regression']
print(lr_bts_df)

# d2 = {'best_model_params': list(gscv_lr_fit_be_mod.items() )}
# d2
# def Merge(dict1, dict2):
#     res = {**dict1, **dict2}
#     return res
# d3 = Merge(d2, lr_bts_dict)
# d3

# Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_lr_fit_be_mod.items() )])
model_params_df = model_params.to_frame()
model_params_df
model_params_df.columns = ['Logistic_Regression']
model_params_df.columns

# Combine the df of scores and the best model params
lr_bts_df.columns
lr_output = pd.concat([model_params_df, lr_bts_df], axis = 0)
lr_output

# Format the combined df
# Drop the best_model_params row from lr_output
lr_df = lr_output.drop([0], axis = 0)
lr_df

#FIXME: tidy the index of the formatted df

###############################################################################
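For context, the core idea in the deleted script is a ClfSwitcher estimator that lets GridSearchCV treat the final classifier itself as a tunable hyperparameter, with one parameter grid per candidate model. Below is a minimal, self-contained sketch of that pattern on a toy dataset; the toy data, the pared-down grids and the fold settings are illustrative stand-ins, not the project's own data or full search space.

# Sketch of the ClfSwitcher + GridSearchCV pattern (illustrative values only)
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

class ClfSwitcher(BaseEstimator):
    """Wrapper whose 'estimator' parameter GridSearchCV can swap out."""
    def __init__(self, estimator = SGDClassifier()):
        self.estimator = estimator
    def fit(self, X, y = None, **kwargs):
        self.estimator.fit(X, y)
        return self
    def predict(self, X):
        return self.estimator.predict(X)
    def score(self, X, y):
        return self.estimator.score(X, y)

# Toy data standing in for the real feature matrix and target
X_toy, y_toy = make_classification(n_samples = 200, n_features = 10, random_state = 42)

pipeline = Pipeline([('pre', MinMaxScaler()), ('clf', ClfSwitcher())])

# One grid per candidate classifier; GridSearchCV searches across all of them
parameters = [
    {'clf__estimator': [LogisticRegression(random_state = 42, max_iter = 500)],
     'clf__estimator__C': np.logspace(0, 4, 5)},
    {'clf__estimator': [RandomForestClassifier(random_state = 42)],
     'clf__estimator__n_estimators': [10, 50]},
]

skf_cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
gscv = GridSearchCV(pipeline, parameters
                    , scoring = {'mcc': make_scorer(matthews_corrcoef)}, refit = 'mcc'
                    , cv = skf_cv, n_jobs = 1)
gscv.fit(X_toy, y_toy)
print(gscv.best_params_)

The hunks from UQ_RF.py below suggest it keeps the same skeleton and only swaps in a different per-model parameter grid.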
UQ_RF.py | 4
@@ -39,8 +39,8 @@ parameters = [
 'clf__estimator__max_depth': [4, 6, 8, 10, 12, 16, 20, None]
 , 'clf__estimator__class_weight':['balanced','balanced_subsample']
 , 'clf__estimator__n_estimators': [10, 25, 50, 100]
-, 'clf__estimator__criterion': ['gini', 'entropy']#, 'log_loss']
-#, 'clf__estimator__max_features': ['auto', 'sqrt']
+, 'clf__estimator__criterion': ['gini', 'entropy', 'log_loss']
+, 'clf__estimator__max_features': ['sqrt', 'log2', None] # default is sqrt
 , 'clf__estimator__min_samples_leaf': [1, 2, 3, 4, 5, 10]
 , 'clf__estimator__min_samples_split': [2, 5, 15, 20]
 }
@@ -207,8 +207,8 @@ X_genomicFN = ['maf'
 # , 'or_fisher'
 # , 'pval_fisher'
 #, 'lineage'
-, 'lineage_count_all'
-, 'lineage_count_unique'
+#, 'lineage_count_all'
+#, 'lineage_count_unique'
 ]

 #%% Construct numerical and categorical column names
@@ -256,7 +256,7 @@ all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']]
 all_df_wtgt.shape
 #%%================================================================
 #%% Apply ML
-#TODO: Apply oversampling!
+#TODO: A

 #%% Data
 #X = all_df_wtgt[numerical_FN+categorical_FN]
@@ -272,16 +272,16 @@ X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']]
 # Quick check
 (X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
 #%% MultClassPipeSKFCV: function call()
-mm_skf_scoresD = MultClassPipeSKFCV(input_df = X
-                                    , target = y
-                                    , var_type = 'numerical'
-                                    , skf_cv = skf_cv)
+# mm_skf_scoresD = MultClassPipeSKFCV(input_df = X
+#                                     , target = y
+#                                     , var_type = 'numerical'
+#                                     , skf_cv = skf_cv)


-mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
-mm_skf_scores_df_all
-mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0)
-mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results
-print(mm_skf_scores_df_train)
-print(mm_skf_scores_df_test)
+# mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
+# mm_skf_scores_df_all
+# mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0)
+# mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results
+# print(mm_skf_scores_df_train)
+# print(mm_skf_scores_df_test)
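Both UQ_LR.py and UQ_RF.py pass a pre-built skf_cv splitter into GridSearchCV and MultClassPipeSKFCV without defining it in the code shown here, so it has to come from the upstream data script. The snippet below is a minimal sketch of how such a stratified splitter is typically constructed with scikit-learn; the fold count and seed are placeholder values, not ones taken from this repository.

# Illustrative only: a stratified k-fold splitter of the kind these scripts
# expect as `skf_cv` (n_splits and random_state are placeholder values).
from sklearn.model_selection import StratifiedKFold

skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)

# Each split preserves the class balance of y in both the train and test folds:
# for train_idx, test_idx in skf_cv.split(X, y):
#     ...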