made var names more meaningful

2022-03-06 14:49:32 +00:00 · 2022-03-06 14:49:32 +00:00 · 6160d943f5
commit 6160d943f5
parent e2b997badf
2 changed files with 85 additions and 8 deletions
--- a/my_data7.py
+++ b/my_data7.py
@@ -29,6 +29,9 @@ from sklearn.metrics import classification_report
 from sklearn.feature_selection import RFE
 from sklearn.feature_selection import RFECV

+#############################
+# trying feature selection
+#############################
 #%%
 model= Pipeline(steps = [
    ('pre', MinMaxScaler()),
@@ -103,6 +106,11 @@ https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.Sequ
 https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
 https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html
 https://www.scikit-yb.org/en/latest/api/model_selection/rfecv.html
+
+#GridSearchCV
+#ParameterGrid
+#RandomizedSearchCV
+#https://medium.com/analytics-vidhya/hyper-parameter-tuning-gridsearchcv-vs-randomizedsearchcv-499862e3ca5
 #%% RFE: Feature selection in classification
 # others in example
 # https://towardsdatascience.com/feature-selection-techniques-for-classification-and-python-tips-for-their-application-10c0ddd7918b
@@ -152,8 +160,8 @@ X_train, X_test, y_train, y_test = train_test_split(X_vars11,
                                                    test_size = 0.33, 
                                                    random_state = 42)

-model_logistic = LogisticRegression()
-sel_rfe_logistic = RFECV(estimator = model_logistic
+model_logistic2 = LogisticRegression()
+sel_rfe_logistic = RFECV(estimator = model_logistic2
                       , cv = 10
                       , step = 1)

@@ -162,3 +170,38 @@ print(sel_rfe_logistic.get_support())
 X_train.columns

 print(sel_rfe_logistic.ranking_)
+#%%
+# TODO: imputation
+# Find out the best way to impute values!
+#from sklearn.impute import SimpleImputer
+# https://towardsdatascience.com/whats-the-best-way-to-handle-nan-values-62d50f738fc
+#KNN and MICE
+my_df2 = pd.read_csv(infile_ml1)
+
+genomicF = ['af'
+        , 'beta_logistic'
+        , 'or_logistic'
+        , 'pval_logistic'
+        , 'se_logistic'
+        , 'zval_logistic'
+        , 'ci_low_logistic'
+        , 'ci_hi_logistic'
+        , 'or_mychisq'
+        , 'log10_or_mychisq'
+        , 'or_fisher'
+        , 'pval_fisher'
+        , 'neglog_pval_fisher'
+        , 'ci_low_fisher'
+        , 'ci_hi_fisher'
+        , 'est_chisq'
+        , 'pval_chisq']
+
+# X_genomicF =  ['af'
+#             , 'or_mychisq'
+#             , 'or_logistic'
+#             , 'or_fisher'
+#             , 'pval_fisher']
+
+my_df2[genomicF].isna().sum()
+
+my_df2[genomicF] = my_df2[genomicF].fillna(value='unknown')