modified loopity and multclass3 to take skf_cv as a parameter for cv

This commit is contained in:
Tanushree Tunstall 2022-03-17 18:17:58 +00:00
parent 97620c1bb0
commit d0c329a1d9
8 changed files with 161 additions and 127 deletions
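In practical terms, the refactor moves fold construction out of the pipeline functions: instead of a hard-coded cv = 10 inside cross_validate, the caller now builds the splitter once and injects it. A minimal sketch of the new call pattern (X and y as assembled in the scripts below; names as in this commit):

    from sklearn.model_selection import StratifiedKFold

    rs = {'random_state': 42}
    # build the splitter once; shuffle and seed are now controlled in one place
    skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, **rs)

    # no pre-split X_train/X_test needed: cross_validate runs inside the function
    mm_skf_scoresD = MultClassPipeSKFCV(input_df = X
                                        , target = y
                                        , var_type = 'mixed'
                                        , skf_cv = skf_cv)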

View file

@@ -61,23 +61,39 @@ from imblearn.combine import SMOTEENN
 from imblearn.under_sampling import EditedNearestNeighbours
 #%%
-rs = {'random_state': 42}
-# Done: add preprocessing step with one hot encoder
-# Done: get accuracy and other scores through K-fold stratified cv
+# rs = {'random_state': 42}
+# njobs = {'n_jobs': 10}

 scoring_fn = ({ 'fscore'    : make_scorer(f1_score)
               , 'mcc'       : make_scorer(matthews_corrcoef)
               , 'precision' : make_scorer(precision_score)
               , 'recall'    : make_scorer(recall_score)
               , 'accuracy'  : make_scorer(accuracy_score)
               , 'roc_auc'   : make_scorer(roc_auc_score)
             #, 'jaccard'    : make_scorer(jaccard_score)
               })

 # Multiple Classification - Model Pipeline
-def MultClassPipelineCV(X_train, X_test, y_train, y_test, input_df, var_type = ['numerical', 'categorical', 'mixed']):
+def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categorical', 'mixed']):
+    '''
+    @param input_df: input features
+    @type: df with input features WITHOUT the target variable
+
+    @param target: target (or output) feature
+    @type: df or np.array or Series
+
+    @param skf_cv: stratified K-fold int or object, allowing shuffle and random state to be passed
+    @type: int or StratifiedKFold()
+
+    @param var_type: numerical, categorical or mixed, to determine which col_transform to apply (MinMaxScaler and/or one-hot encoder)
+    @type: list
+
+    returns
+    Dict containing multiple classification scores for each model and the mean of each stratified K-fold, including training
+    '''
     # determine categorical and numerical features
     numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
     numerical_ix
@@ -98,66 +114,61 @@ def MultClassPipelineCV(X_train, X_test, y_train, y_test, input_df, var_type = [
     col_transform = ColumnTransformer(transformers = t
                                       , remainder = 'passthrough')

-#%%
+    #%% Specify multiple Classification models
     log_reg = LogisticRegression(**rs)
     nb = BernoulliNB()
     knn = KNeighborsClassifier()
     svm = SVC(**rs)
-    mlp = MLPClassifier(max_iter=500, **rs)
+    mlp = MLPClassifier(max_iter = 500, **rs)
     dt = DecisionTreeClassifier(**rs)
     et = ExtraTreesClassifier(**rs)
     rf = RandomForestClassifier(**rs)
     rf2 = RandomForestClassifier(
-        min_samples_leaf=50,
-        n_estimators=150,
-        bootstrap=True,
-        oob_score=True,
-        n_jobs=-1,
-        random_state=42,
-        max_features='auto')
+        min_samples_leaf = 50
+        , n_estimators = 150
+        , bootstrap = True
+        , oob_score = True
+        , **njobs
+        , **rs
+        , max_features = 'auto')

-    xgb = XGBClassifier(**rs, verbosity=0)
+    xgb = XGBClassifier(**rs
+                        , verbosity = 0, use_label_encoder = False)

-    models = [
-        ('Logistic Regression', log_reg),
-        ('Naive Bayes', nb),
-        ('K-Nearest Neighbors', knn),
-        ('SVM', svm),
-        ('MLP', mlp),
-        ('Decision Tree', dt),
-        ('Extra Trees', et),
-        ('Random Forest', rf),
-        ('Random Forest2', rf2),
-        #('XGBoost', xgb)
-    ]
+    models = [('Logistic Regression', log_reg)
+              , ('Naive Bayes'        , nb)
+              , ('K-Nearest Neighbors', knn)
+              , ('SVM'                , svm)
+              , ('MLP'                , mlp)
+              , ('Decision Tree'      , dt)
+              , ('Extra Trees'        , et)
+              , ('Random Forest'      , rf)
+              , ('Naive Bayes'        , nb)
+              , ('Random Forest2'     , rf2)
+              , ('XGBoost'            , xgb)]

-    skf_cv_scores = {}
+    mm_skf_scoresD = {}

     for model_name, model_fn in models:
         print('\nModel_name:', model_name
               , '\nModel func:' , model_fn
               , '\nList of models:', models)

-        # model_pipeline = Pipeline([
-        #     ('pre'   , MinMaxScaler())
-        #     , ('model' , model_fn)])

         model_pipeline = Pipeline([
             ('prep'    , col_transform)
             , ('model' , model_fn)])

         print('Running model pipeline:', model_pipeline)
-        skf_cv = cross_validate(model_pipeline
-                                , X_train
-                                , y_train
-                                , cv = 10
-                                , scoring = scoring_fn
-                                , return_train_score = True)
-        skf_cv_scores[model_name] = {}
-        for key, value in skf_cv.items():
+        skf_cv_mod = cross_validate(model_pipeline
+                                    , input_df
+                                    , target
+                                    , cv = skf_cv
+                                    , scoring = scoring_fn
+                                    , return_train_score = True)
+        mm_skf_scoresD[model_name] = {}
+        for key, value in skf_cv_mod.items():
             print('\nkey:', key, '\nvalue:', value)
             print('\nmean value:', mean(value))
-            skf_cv_scores[model_name][key] = round(mean(value),2)
-    #pp.pprint(skf_cv_scores)
-    return(skf_cv_scores)
+            mm_skf_scoresD[model_name][key] = round(mean(value),2)
+    #pp.pprint(mm_skf_scoresD)
+    return(mm_skf_scoresD)

View file

@@ -5,29 +5,19 @@ Created on Tue Mar 15 11:09:50 2022
 @author: tanu
 """
-# stratified shuffle split
-X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
-                                                    , num_df_wtgt['mutation_class']
-                                                    , test_size = 0.33
-                                                    , **rs
-                                                    , shuffle = True
-                                                    , stratify = num_df_wtgt['mutation_class'])
-
-y_train.to_frame().value_counts().plot(kind = 'bar')
-y_test.to_frame().value_counts().plot(kind = 'bar')
-
-MultClassPipelineCV(X_train, X_test, y_train, y_test
-                    , input_df = num_df_wtgt[numerical_FN]
-                    , var_type = 'numerical')
-
-skf_cv_scores = MultClassPipelineCV(X_train, X_test, y_train, y_test
-                                    , input_df = num_df_wtgt[numerical_FN]
-                                    , var_type = 'numerical')
-
-pp.pprint(skf_cv_scores)
-# construct a df
-skf_cv_scores_df = pd.DataFrame(skf_cv_scores)
-skf_cv_scores_df
-skf_cv_scores_df_test = skf_cv_scores_df.filter(like='test_', axis=0)
-skf_cv_scores_df_train = skf_cv_scores_df.filter(like='train_', axis=0)
+#%% Data
+X = all_df_wtgt[numerical_FN+categorical_FN]
+y = all_df_wtgt['mutation_class']
+
+#%% variables
+
+#%% MultClassPipeSKFCV: function call()
+mm_skf_scoresD = MultClassPipeSKFCV(input_df = X
+                                    , target = y
+                                    , var_type = 'mixed'
+                                    , skf_cv = skf_cv)
+
+mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
+mm_skf_scores_df_all
+mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0)
+mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results
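The test_/train_ row filters rely on cross_validate's naming scheme: with a dict of scorers and return_train_score = True, the returned keys are 'fit_time', 'score_time', and one 'test_<scorer>' / 'train_<scorer>' pair per scorer, and those keys become the row index of the DataFrame built above. Roughly:

    # rows of mm_skf_scores_df_all (one column per model):
    #   fit_time, score_time,
    #   test_fscore, train_fscore,
    #   test_mcc,    train_mcc,
    #   ... one pair per entry in scoring_fn
    mm_skf_scores_df_all.filter(like='test_', axis=0)   # held-out fold means only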

View file

@@ -138,6 +138,14 @@ parameters = [
         #'tfidf__stop_words': [None],
         'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
     },
+    {
+        'clf__estimator': [LogisticRegression()],
+        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
+        'penalty': ['none', 'l1', 'l2', 'elasticnet'],
+        'max_iter': list(range(100,800,100)),
+        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
+    },
 ]

 pipeline = Pipeline([
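A caveat on the grid added in this hunk: if the parameters list is passed to GridSearchCV over the pipeline (as the existing 'clf__estimator__alpha' key suggests), the bare 'C', 'penalty', 'max_iter' and 'solver' keys would raise an invalid-parameter error, because grid keys must be addressed through the pipeline step. A corrected sketch, assuming the step is named 'clf' and wraps the estimator:

    {
        'clf__estimator': [LogisticRegression()],
        'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
        'clf__estimator__max_iter': list(range(100, 800, 100)),
        'clf__estimator__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    },

Note also that not every solver/penalty pair is valid (e.g. 'liblinear' supports neither 'none' nor 'elasticnet'), so GridSearchCV would need error_score = np.nan to skip failing combinations rather than aborting.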

View file

@@ -17,8 +17,12 @@ from sklearn.neighbors import KNeighborsClassifier
 from sklearn.svm import SVC
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.neural_network import MLPClassifier
 from xgboost import XGBClassifier
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.linear_model import SGDClassifier
 from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
 from sklearn.compose import ColumnTransformer

@@ -52,11 +56,29 @@ from imblearn.over_sampling import RandomOverSampler
 from imblearn.over_sampling import SMOTE
 from imblearn.pipeline import Pipeline
 #from sklearn.datasets import make_classification
-from sklearn.model_selection import cross_validate
+from sklearn.model_selection import cross_validate, cross_val_score
 from sklearn.model_selection import RepeatedStratifiedKFold
 from sklearn.ensemble import AdaBoostClassifier
 from imblearn.combine import SMOTEENN
 from imblearn.under_sampling import EditedNearestNeighbours
+from sklearn.model_selection import GridSearchCV
+from sklearn.base import BaseEstimator
+
+scoring_fn = ({'accuracy'   : make_scorer(accuracy_score)
+              , 'fscore'    : make_scorer(f1_score)
+              , 'mcc'       : make_scorer(matthews_corrcoef)
+              , 'precision' : make_scorer(precision_score)
+              , 'recall'    : make_scorer(recall_score)
+              , 'roc_auc'   : make_scorer(roc_auc_score)
+              })
+
+rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
+skf_cv = StratifiedKFold(n_splits = 10
+                         #, shuffle = False, random_state = None)
+                         , shuffle = True, **rs)
 #%%
 homedir = os.path.expanduser("~")
 os.chdir(homedir + "/git/ML_AI_training/")

@@ -64,8 +86,8 @@ os.chdir(homedir + "/git/ML_AI_training/")
 # my function
 from MultClassPipe import MultClassPipeline
 from MultClassPipe2 import MultClassPipeline2
-from loopity_loop import MultClassPipeSKF
-from MultClassPipe3 import MultClassPipelineCV
+from loopity_loop import MultClassPipeSKFLoop
+from MultClassPipe3 import MultClassPipeSKFCV

 gene = 'pncA'

@@ -199,3 +221,16 @@ cat_df_wtgt.shape
 all_df_wtgt = my_df[numerical_FN + categorical_FN + ['mutation_class']]
 all_df_wtgt.shape
-#%%
+#%% Get train-test split and scoring functions
+X = num_df_wtgt[numerical_FN]
+y = num_df_wtgt['mutation_class']
+
+X_train, X_test, y_train, y_test = train_test_split(X
+                                                    , y
+                                                    , test_size = 0.33
+                                                    , random_state = 2
+                                                    , shuffle = True
+                                                    , stratify = y)
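For context on why the explicit splitter is worth injecting: for classifiers, an integer cv already means stratified folds, but without shuffling and with no seed. A sketch of the difference:

    from sklearn.model_selection import StratifiedKFold

    # what cv = 10 meant before, for a classifier (no shuffle, no seed):
    default_cv = StratifiedKFold(n_splits = 10)
    # what the refactor passes instead (shuffled, reproducible):
    skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)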

View file

@@ -33,23 +33,30 @@ from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
 from statistics import mean, stdev, median, mode
 #%%
 rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
 # Done: add preprocessing step with one hot encoder
-# TODO: supply stratified K-fold cv train and test data
+# TODO: supply stratified K-fold cv train and test dataskf
 # TODO: get accuracy and other scores through K-fold cv

 # Multiple Classification - Model Pipeline
-def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical', 'mixed'], skf_splits = 10):
+def MultClassPipeSKFLoop(input_df, target, skf_cv, var_type = ['numerical', 'categorical', 'mixed']):
     '''
     @param input_df: input features
-    @type: df (gets converted to np.array for stratified K-fold, and helps identify names to apply column transformation)
-    @param y_outputF: target (or output) feature
-    @type: df or np.array
+    @type: df with input features WITHOUT the target variable
+
+    @param target: target (or output) feature
+    @type: df or np.array or Series
+
+    @param skf_cv: stratified K-fold int or object, allowing shuffle and random state to be passed
+    @type: int or StratifiedKFold()
+
+    @param var_type: numerical, categorical or mixed, to determine which col_transform to apply (MinMaxScaler and/or one-hot encoder)
+    @type: list

     returns
-    multiple classification model scores
+    Dict containing multiple classification scores for each model and each stratified K-fold
     '''
     # Determine categorical and numerical features

@@ -86,17 +93,17 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
                          , n_estimators = 150
                          , bootstrap = True
                          , oob_score = True
-                         , n_jobs = -1
+                         , **njobs
                          , **rs
                          , max_features = 'auto')
-    xgb = XGBClassifier(**rs, verbosity = 0)
+    xgb = XGBClassifier(**rs, verbosity = 0, use_label_encoder = False)

     classification_metrics = {
         'F1_score': []
         ,'MCC': []
         ,'Precision': []
         ,'Recall': []
-        ,'Accuracy': []
+        , 'Accuracy': []
         ,'ROC_AUC': []
         }

@@ -109,33 +116,29 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
               , ('Extra Trees'    , et)
               , ('Random Forest'  , rf)
               , ('Naive Bayes'    , nb)
               , ('Random Forest2' , rf2)
-              #, ('XGBoost'       , xgb)
+              , ('XGBoost'        , xgb)
               ]

-    skf = StratifiedKFold(n_splits = skf_splits
-                          , shuffle = True
-                          , **rs)
-    # skf_dict = {}
+    # skf = StratifiedKFold(n_splits = 10
+    #                       #, shuffle = False, random_state = None)
+    #                       , shuffle = True, **rs)

     fold_no = 1
     fold_dict={}
     for model_name, model in models:
         fold_dict.update({ model_name: {}})

     #scores_df = pd.DataFrame()
-    for train_index, test_index in skf.split(input_df, y_targetF):
+    for train_index, test_index in skf_cv.split(input_df, target):
         x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
-        y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
+        y_train_fold, y_test_fold = target.iloc[train_index], target.iloc[test_index]
         #print("Fold: ", fold_no, len(train_index), len(test_index))

         for model_name, model in models:
             print("\nStart of model", model_name, "\nLoop no.", fold_no)
-            #skf_dict.update({model_name: classification_metrics })
             model_pipeline = Pipeline(steps=[('prep'        , col_transform)
                                              , ('classifier' , model)])
             model_pipeline.fit(x_train_fold, y_train_fold)
             y_pred_fold = model_pipeline.predict(x_test_fold)

@@ -168,14 +171,4 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
                 fold_dict[model_name][fold].update({'ROC_AUC' : roc_auc})
         fold_no +=1
-    #pp.pprint(skf_dict)
     return(fold_dict)
-
-#%% Call function
-# t3_res = MultClassPipeSKF(input_df = numerical_features_df
-#                           , y_targetF = target1
-#                           , var_type = 'numerical'
-#                           , skf_splits = 10)
-# pp.pprint(t3_res)
-# #print(t3_res)

View file

@@ -5,22 +5,19 @@ Created on Fri Mar 11 11:15:50 2022
 @author: tanu
 """
-#%%
-del(t3_res)
-# t3_res = MultClassPipeSKF(input_df = numerical_features_df
-#                           , y_targetF = target1
-#                           , var_type = 'numerical'
-#                           , skf_splits = 10)
-# pp.pprint(t3_res)
-# #print(t3_res)
+#%% variables
+rs = {'random_state': 42}
+
+skf_cv = StratifiedKFold(n_splits = 10
+                         #, shuffle = False, random_state = None)
+                         , shuffle = True, **rs)

-t3_res = MultClassPipeSKF(input_df = num_df_wtgt[numerical_FN]
-                          , y_targetF = num_df_wtgt['mutation_class']
+#%% MultClassPipeSKFLoop: function call()
+t3_res = MultClassPipeSKFLoop(input_df = num_df_wtgt[numerical_FN]
+                              , target = num_df_wtgt['mutation_class']
                               , var_type = 'numerical'
-                          , skf_splits = 10)
+                              , skf_cv = skf_cv)
 pp.pprint(t3_res)
 #print(t3_res)

 ################################################################
 # extract items from within a nested dict
 #%% Classification Metrics we need to mean()
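The nested dict returned by MultClassPipeSKFLoop has the shape {model_name: {fold: {metric: value}}}, so the per-model means have to be pulled out of the inner level. A minimal sketch of that extraction (hypothetical helper names, assuming the t3_res structure above):

    from statistics import mean
    import pandas as pd

    mean_scoresD = {}
    for model_name, folds in t3_res.items():
        per_metric = {}
        # collect each metric's value across this model's folds
        for fold, metrics in folds.items():
            for metric, value in metrics.items():
                per_metric.setdefault(metric, []).append(value)
        # average across folds, rounded as in MultClassPipeSKFCV
        mean_scoresD[model_name] = {m: round(mean(v), 2) for m, v in per_metric.items()}

    mean_scores_df = pd.DataFrame(mean_scoresD)  # metrics as rows, models as columns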