made var names more meaningful
This commit is contained in:
parent
e2b997badf
commit
6160d943f5
2 changed files with 85 additions and 8 deletions
47
my_data7.py
47
my_data7.py
|
@ -29,6 +29,9 @@ from sklearn.metrics import classification_report
|
|||
from sklearn.feature_selection import RFE
|
||||
from sklearn.feature_selection import RFECV
|
||||
|
||||
#############################
|
||||
# trying feature selection
|
||||
#############################
|
||||
#%%
|
||||
model= Pipeline(steps = [
|
||||
('pre', MinMaxScaler()),
|
||||
|
@ -103,6 +106,11 @@ https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.Sequ
|
|||
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
|
||||
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html
|
||||
https://www.scikit-yb.org/en/latest/api/model_selection/rfecv.html
|
||||
|
||||
#GridSearchCV
|
||||
#ParameterGrid
|
||||
#RandomizedSearchCV
|
||||
#https://medium.com/analytics-vidhya/hyper-parameter-tuning-gridsearchcv-vs-randomizedsearchcv-499862e3ca5
|
||||
#%% RFE: Feature selection in classification
|
||||
# others in example
|
||||
# https://towardsdatascience.com/feature-selection-techniques-for-classification-and-python-tips-for-their-application-10c0ddd7918b
|
||||
|
@ -152,8 +160,8 @@ X_train, X_test, y_train, y_test = train_test_split(X_vars11,
|
|||
test_size = 0.33,
|
||||
random_state = 42)
|
||||
|
||||
model_logistic = LogisticRegression()
|
||||
sel_rfe_logistic = RFECV(estimator = model_logistic
|
||||
model_logistic2 = LogisticRegression()
|
||||
sel_rfe_logistic = RFECV(estimator = model_logistic2
|
||||
, cv = 10
|
||||
, step = 1)
|
||||
|
||||
|
@ -162,3 +170,38 @@ print(sel_rfe_logistic.get_support())
|
|||
X_train.columns
|
||||
|
||||
print(sel_rfe_logistic.ranking_)
|
||||
#%%
|
||||
# TODO: imputation
|
||||
# Find out the best way to impute values!
|
||||
#from sklearn.impute import SimpleImputer
|
||||
# https://towardsdatascience.com/whats-the-best-way-to-handle-nan-values-62d50f738fc
|
||||
#KNN and MICE
|
||||
my_df2 = pd.read_csv(infile_ml1)
|
||||
|
||||
genomicF = ['af'
|
||||
, 'beta_logistic'
|
||||
, 'or_logistic'
|
||||
, 'pval_logistic'
|
||||
, 'se_logistic'
|
||||
, 'zval_logistic'
|
||||
, 'ci_low_logistic'
|
||||
, 'ci_hi_logistic'
|
||||
, 'or_mychisq'
|
||||
, 'log10_or_mychisq'
|
||||
, 'or_fisher'
|
||||
, 'pval_fisher'
|
||||
, 'neglog_pval_fisher'
|
||||
, 'ci_low_fisher'
|
||||
, 'ci_hi_fisher'
|
||||
, 'est_chisq'
|
||||
, 'pval_chisq']
|
||||
|
||||
# X_genomicF = ['af'
|
||||
# , 'or_mychisq'
|
||||
# , 'or_logistic'
|
||||
# , 'or_fisher'
|
||||
# , 'pval_fisher']
|
||||
|
||||
my_df2[genomicF].isna().sum()
|
||||
|
||||
my_df2[genomicF] = my_df2[genomicF].fillna(value='unknown')
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue