trying under and oversampling

This commit is contained in:
Tanushree Tunstall 2022-05-26 07:38:21 +01:00
parent 8f8306d948
commit 5779331981
5 changed files with 129 additions and 16 deletions

View file

@ -60,6 +60,18 @@ from sklearn.ensemble import AdaBoostClassifier
from imblearn.combine import SMOTEENN from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
#%% #%%
rs = {'random_state': 42} rs = {'random_state': 42}
njobs = {'n_jobs': 10} njobs = {'n_jobs': 10}
@ -122,8 +134,7 @@ def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categ
mlp = MLPClassifier(max_iter = 500, **rs) mlp = MLPClassifier(max_iter = 500, **rs)
dt = DecisionTreeClassifier(**rs) dt = DecisionTreeClassifier(**rs)
et = ExtraTreesClassifier(**rs) et = ExtraTreesClassifier(**rs)
rf = RandomForestClassifier(**rs, rf = RandomForestClassifier(**rs, n_estimators = 1000 )
n_estimators = 1000 )
rf2 = RandomForestClassifier( rf2 = RandomForestClassifier(
min_samples_leaf = 5 min_samples_leaf = 5
, n_estimators = 100 #10 , n_estimators = 100 #10
@ -136,7 +147,7 @@ def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categ
lda = LinearDiscriminantAnalysis() lda = LinearDiscriminantAnalysis()
mnb = MultinomialNB(**rs) mnb = MultinomialNB()
pa = PassiveAggressiveClassifier(**rs, **njobs) pa = PassiveAggressiveClassifier(**rs, **njobs)

View file

@ -7,7 +7,7 @@ Created on Tue Mar 15 11:09:50 2022
""" """
#%% Data #%% Data
X = all_df_wtgt[numerical_FN+categorical_FN] X = all_df_wtgt[numerical_FN+categorical_FN]
y = all_df_wtgt['mutation_class'] y = all_df_wtgt[drug]
#y = all_df_wtgt['dst_mode'] #y = all_df_wtgt['dst_mode']
#%% variables #%% variables
@ -17,9 +17,40 @@ mm_skf_scoresD = MultClassPipeSKFCV(input_df = X
, var_type = 'mixed' , var_type = 'mixed'
, skf_cv = skf_cv) , skf_cv = skf_cv)
mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD) mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
mm_skf_scores_df_all mm_skf_scores_df_all
mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0) mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0)
mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results
]
#%%
mm_skf_scoresD2 = MultClassPipeSKFCV(input_df = X_sm
, target = y_sm
, var_type = 'mixed'
, skf_cv = skf_cv)
sm_all = pd.DataFrame(mm_skf_scoresD2)
sm_df_CT = sm_all.filter(like='test_', axis=0)
#%%
mm_skf_scoresD3 = MultClassPipeSKFCV(input_df = X_ros
, target = y_ros
, var_type = 'mixed'
, skf_cv = skf_cv)
ros_all = pd.DataFrame(mm_skf_scoresD3)
ros_CT = ros_all.filter(like='test_', axis=0)
#%%
mm_skf_scoresD4 = MultClassPipeSKFCV(input_df = X_rus
, target = y_rus
, var_type = 'mixed'
, skf_cv = skf_cv)
rus_all = pd.DataFrame(mm_skf_scoresD4)
rus_CT = rus_all.filter(like='test_', axis=0)
#%%
mm_skf_scoresD5 = MultClassPipeSKFCV(input_df = X_enn
, target = y_enn
, var_type = 'mixed'
, skf_cv = skf_cv)
enn_all = pd.DataFrame(mm_skf_scoresD5)
enn_CT = enn_all.filter(like='test_', axis=0)

View file

@ -36,7 +36,8 @@ from sklearn.gaussian_process.kernels import WhiteKernel
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier from sklearn.linear_model import RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC from sklearn.svm import SVC
from xgboost import XGBClassifier from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB from sklearn.naive_bayes import MultinomialNB
@ -72,6 +73,7 @@ print(pd.__version__)
from statistics import mean, stdev, median, mode from statistics import mean, stdev, median, mode
from imblearn.over_sampling import RandomOverSampler from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline from imblearn.pipeline import Pipeline
from sklearn.datasets import make_classification from sklearn.datasets import make_classification
@ -81,6 +83,7 @@ from sklearn.ensemble import AdaBoostClassifier
from imblearn.combine import SMOTEENN from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.model_selection import GridSearchCV from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator from sklearn.base import BaseEstimator
import json import json
@ -119,6 +122,10 @@ from MultClassPipe3 import MultClassPipeSKFCV
gene = 'pncA' gene = 'pncA'
drug = 'pyrazinamide' drug = 'pyrazinamide'
#gene = 'katG'
#drug = 'isoniazid'
#============== #==============
# directories # directories
#============== #==============
@ -234,13 +241,13 @@ numerical_FN = common_cols_stabiltyN + foldX_cols + X_strFN + X_evolFN + X_genom
#categorical feature names #categorical feature names
categorical_FN = ['ss_class' categorical_FN = ['ss_class'
, 'wt_prop_water' # , 'wt_prop_water'
# , 'lineage_labels' # misleading if using merged_df3 # , 'lineage_labels' # misleading if using merged_df3
, 'mut_prop_water' # , 'mut_prop_water'
, 'wt_prop_polarity' # , 'wt_prop_polarity'
, 'mut_prop_polarity' # , 'mut_prop_polarity'
, 'wt_calcprop' # , 'wt_calcprop'
, 'mut_calcprop' # , 'mut_calcprop'
#, 'active_aa_pos' #, 'active_aa_pos'
] ]
@ -278,9 +285,9 @@ all_df_wtgt.shape
#------ #------
# X # X
#------ #------
X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL #X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL
X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL
#X = all_df_wtgt[numerical_FN] # training numerical only X = all_df_wtgt[numerical_FN] # training numerical only
#X_bts = blind_test_df[numerical_FN] # blind test data numerical #X_bts = blind_test_df[numerical_FN] # blind test data numerical
#------ #------

View file

@ -574,6 +574,7 @@ param_grid_svc = [
#======== #========
# LDA # LDA
# https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/lda.py # https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/lda.py
# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html
#======== #========
estimator = LinearDiscriminantAnalysis() estimator = LinearDiscriminantAnalysis()
@ -605,9 +606,10 @@ param_grid_lda = [
#======== #========
# Multinomial_nb # Multinomial_nb
# https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/multinomial_nb.py # https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/multinomial_nb.py
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
#======== #========
estimator = MultinomialNB(**rs) estimator = MultinomialNB()
# Define pipleline with steps # Define pipleline with steps
pipe_mnb = Pipeline([ pipe_mnb = Pipeline([
@ -635,6 +637,7 @@ param_grid_mnb = [
#======== #========
# passive_aggressive # passive_aggressive
# https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/passive_aggressive.py # https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/passive_aggressive.py
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html
#======== #========
estimator = PassiveAggressiveClassifier(**rs, **njobs) estimator = PassiveAggressiveClassifier(**rs, **njobs)
@ -668,6 +671,7 @@ param_grid_pa = [
#======== #========
# SGD # SGD
# https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/sgd.py # https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/sgd.py
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
#======== #========
estimator = SGDClassifier(**rs, **njobs) estimator = SGDClassifier(**rs, **njobs)

View file

@ -0,0 +1,60 @@
import pandas as pd
import numpy as np
import scipy as sp
import time
import sys
import os
import re
import argparse
from math import sqrt
from scipy import stats
import joblib
# Algorithm
from xgboost.sklearn import XGBClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.utils import all_estimators
# Pre-processing
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
# Metric
from sklearn.metrics import mean_squared_error, make_scorer, roc_auc_score, f1_score, matthews_corrcoef, accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report
def run_all_ML(input_pd, target_label):
    """Cross-validate every scikit-learn classifier and tabulate the scores.

    Each classifier returned by ``all_estimators(type_filter="classifier")``
    is instantiated with its default arguments, wrapped in a one-step
    ``Pipeline`` and evaluated with 10-fold ``cross_val_predict``.

    Parameters
    ----------
    input_pd
        Feature matrix, passed to ``cross_val_predict`` as ``X`` unchanged.
    target_label
        The target vector itself (NOT a column name, despite the parameter
        name); passed to ``cross_val_predict`` as ``y`` unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per classifier that ran successfully, with columns
        ``estimator, TP, TN, FP, FN, roc_auc, matthew, bacc, f1``.
        Score columns keep their numeric dtype. If no classifier succeeds
        the frame is empty but still carries these columns.

    Notes
    -----
    The confusion matrix is unpacked into exactly four cells, so only binary
    targets produce a row. Any exception raised for a classifier (missing
    required constructor arguments, unsupported input, non-binary target, ...)
    is printed and that classifier is skipped; the exception is not re-raised.
    """
    y = target_label
    X = input_pd

    columns = ['estimator', 'TP', 'TN', 'FP', 'FN',
               'roc_auc', 'matthew', 'bacc', 'f1']
    rows = []

    for name, algorithm in all_estimators(type_filter="classifier"):
        try:
            pipe = Pipeline([
                ("model", algorithm())
            ])
            y_pred = cross_val_predict(pipe, X, y, cv=10, n_jobs=10)

            # scikit-learn metrics take (y_true, y_pred) in that order;
            # swapping them exchanges FP/FN and skews the asymmetric scores.
            _mcc = round(matthews_corrcoef(y, y_pred), 3)
            _bacc = round(balanced_accuracy_score(y, y_pred), 3)
            _f1 = round(f1_score(y, y_pred), 3)
            # NOTE: computed from hard class predictions, not probabilities.
            _roc_auc = round(roc_auc_score(y, y_pred), 3)
            _tn, _fp, _fn, _tp = confusion_matrix(y, y_pred).ravel()

            rows.append({
                'estimator': name,
                'TP': _tp,
                'TN': _tn,
                'FP': _fp,
                'FN': _fn,
                'roc_auc': _roc_auc,
                'matthew': _mcc,
                'bacc': _bacc,
                'f1': _f1,
            })
        except Exception as e:
            # Deliberate best-effort: report the failing estimator and move on.
            print("Got an error while running {}".format(name))
            print(e)

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # re-allocates the whole frame on every call.
    return pd.DataFrame(rows, columns=columns)