#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 5 12:57:32 2022

@author: tanu
"""
import os
import sys

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
    f1_score,
)
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV

#############################
# trying feature selection
#############################
#%%
# Two-step pipeline: min-max scale the features, then fit a class-weighted
# logistic regression.
model = Pipeline(
    steps=[
        ('pre', MinMaxScaler()),
        ('reg', LogisticRegression(class_weight='balanced')),
    ]
)


def precision(y_true, y_pred):
    """Return the precision of ``y_pred`` with label 1 as the positive class."""
    return precision_score(y_true, y_pred, pos_label=1)


def recall(y_true, y_pred):
    """Return the recall of ``y_pred`` with label 1 as the positive class."""
    return recall_score(y_true, y_pred, pos_label=1)


def f1(y_true, y_pred):
    """Return the F1 score of ``y_pred`` with label 1 as the positive class."""
    return f1_score(y_true, y_pred, pos_label=1)


# Wrap each metric as a scorer object usable by cross_validate.
acc = make_scorer(accuracy_score)
prec = make_scorer(precision)
rec = make_scorer(recall)
# From here on the name ``f1`` refers to the scorer, not the function above.
f1 = make_scorer(f1)

# 10-fold cross-validation of the pipeline; X_train / y_train are expected to
# exist already (they are not created in this cell).
output = cross_validate(
    model,
    X_train,
    y_train,
    scoring={
        'acc': acc,
        'prec': prec,
        'rec': rec,
        'f1': f1,
    },
    cv=10,
    return_train_score=False,
)

# Per-column mean over the folds (displayed only when run interactively).
pd.DataFrame(output).mean()

#%%
# classification_report: lowest scores but does it give numbers for all your classes!
# Fit the pipeline on the training split and predict the held-out split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

f1_score(y_test, y_pred)
# NOTE(review): this AUC is computed from hard class predictions; scores such
# as model.predict_proba(X_test)[:, 1] are the usual input -- confirm intent.
roc_auc_score(y_test, y_pred)  # not sure!
#roc_curve(y_test, y_pred)

classification_report(y_test, y_pred)
target_names = {1: 'Resistant', 0: 'Sensitive'}
print(classification_report(y_test
                            , y_pred
                            #, target_names=y_test.map(target_names)
                            ))

#%%NOT SURE!
from itertools import combinations


def train(X):
    """Cross-validate the module-level ``model`` on the feature matrix ``X``.

    Uses the module-level ``y_train`` as the target and the ``acc``, ``prec``,
    ``rec`` and ``f1`` scorers with 10 folds.

    Returns the value of ``cross_validate`` unchanged; because ``scoring`` is
    a dict of several scorers, that value is a dict of per-fold arrays.
    """
    return cross_validate(model, X, y_train
                          #, scoring = make_scorer(accuracy_score)
                          , scoring={'acc': acc
                                     , 'prec': prec
                                     , 'rec': rec
                                     , 'f1': f1}
                          , cv=10
                          , return_train_score=False)
    #, return_estimator = True)['test_score']


# NOTE(review): choosing len(X_train.columns) items out of X_train.columns
# produces a single combination (every column), so ``scores`` has one entry.
# The subset is converted to a list so .loc reads it as a collection of labels.
scores = [train(X_train.loc[:, list(feature_subset)])
          for feature_subset in combinations(X_train.columns,
                                             len(X_train.columns))]

# Each entry of ``scores`` is a dict, which has no .mean(); average the folds
# through a DataFrame, the same way the first cross-validation cell does.
means = [pd.DataFrame(score).mean() for score in scores]
means

#%%
# TO TRY
# https://rasbt.github.io/mlxtend/

# stackoverflow
# informative post
# https://datascience.stackexchange.com/questions/937/does-scikit-learn-have-a-forward-selection-stepwise-regression-algorithm
# https://datascience.stackexchange.com/questions/24405/how-to-do-stepwise-regression-using-sklearn/24447#24447
# https://stats.stackexchange.com/questions/204141/difference-between-selecting-features-based-on-f-regression-and-based-on-r2

# 0.24 version, it supports
# https://scikit-learn.org/stable/auto_examples/release_highlights/plot_release_highlights_0_24_0.html#new-sequentialfeatureselector-transformer
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html
# https://www.scikit-yb.org/en/latest/api/model_selection/rfecv.html

#GridSearchCV
#ParameterGrid
#RandomizedSearchCV
#https://medium.com/analytics-vidhya/hyper-parameter-tuning-gridsearchcv-vs-randomizedsearchcv-499862e3ca5

#%% RFE:
# Feature selection in classification
# others in example
# https://towardsdatascience.com/feature-selection-techniques-for-classification-and-python-tips-for-their-application-10c0ddd7918b
# https://towardsdatascience.com/feature-selection-using-python-for-classification-problem-b5f00a1c7028

# Two alternative estimators are assigned back to back; when the cell is run
# top to bottom only the second (all-defaults) one reaches RFE.
model_logistic = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=1000,
)
model_logistic = LogisticRegression()

# Recursive feature elimination: drop one feature per step until four remain.
sel_rfe_logistic = RFE(
    estimator=model_logistic,
    n_features_to_select=4,
    step=1,
)
X_train_rfe_logistic = sel_rfe_logistic.fit_transform(X_train, y_train)

print(sel_rfe_logistic.get_support())
print(sel_rfe_logistic.ranking_)

#%% RFECV
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html

# Alternative targets; each assignment replaces the previous one, so the last
# line wins when the cell is executed as a whole.
target = target1
target = target3
target = target4

# Alternative feature sets, split identically (33% test, seed 42). Every
# split overwrites the previous one; the X_vars11 split is the one left in
# place after the cell has run.
X_train, X_test, y_train, y_test = train_test_split(
    X_vars1, target, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X_vars2, target, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X_vars3, target, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X_vars5, target, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X_vars11, target, test_size=0.33, random_state=42)

model_logistic2 = LogisticRegression()

# Recursive feature elimination with 10-fold cross-validation, one feature
# removed per step.
sel_rfe_logistic = RFECV(
    estimator=model_logistic2,
    cv=10,
    step=1,
)
X_train_rfe_logistic = sel_rfe_logistic.fit_transform(X_train, y_train)

print(sel_rfe_logistic.get_support())
X_train.columns
print(sel_rfe_logistic.ranking_)

#%%
# TODO: imputation
# Find out the best way to impute values!
#from sklearn.impute import SimpleImputer
# https://towardsdatascience.com/whats-the-best-way-to-handle-nan-values-62d50f738fc
#KNN and MICE

# Load the table from the path held in ``infile_ml1`` (set outside this cell).
my_df2 = pd.read_csv(infile_ml1)

# Columns treated as genomic features for the missing-value handling below.
genomicF = [
    'af',
    'beta_logistic',
    'or_logistic',
    'pval_logistic',
    'se_logistic',
    'zval_logistic',
    'ci_low_logistic',
    'ci_hi_logistic',
    'or_mychisq',
    'log10_or_mychisq',
    'or_fisher',
    'pval_fisher',
    'neglog_pval_fisher',
    'ci_low_fisher',
    'ci_hi_fisher',
    'est_chisq',
    'pval_chisq',
]

# X_genomicF = ['af'
#               , 'or_mychisq'
#               , 'or_logistic'
#               , 'or_fisher'
#               , 'pval_fisher']

# Count of missing values per genomic column (shown only interactively).
my_df2[genomicF].isna().sum()

# Replace every missing genomic value with the string 'unknown'.
# NOTE(review): if these columns are numeric, a string fill value presumably
# turns them into non-numeric columns -- confirm before feeding them to a model.
my_df2[genomicF] = my_df2[genomicF].fillna(value='unknown')