trying under- and oversampling

Tanushree Tunstall 2022-05-26 07:38:21 +01:00
parent 8f8306d948
commit 5779331981
5 changed files with 129 additions and 16 deletions


@@ -36,7 +36,8 @@ from sklearn.gaussian_process.kernels import WhiteKernel
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
@@ -72,6 +73,7 @@ print(pd.__version__)
from statistics import mean, stdev, median, mode
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.datasets import make_classification
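The samplers imported above (RandomOverSampler, RandomUnderSampler, SMOTE) are only imported in this hunk; how they are wired together is not shown here. A minimal sketch, using toy data and illustrative sampling ratios rather than anything from this commit, of chaining over- and undersampling in an imblearn Pipeline:

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# toy imbalanced data; the ratios below are illustrative, not values from this commit
X_toy, y_toy = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=42)
resample_pipe = Pipeline(steps=[
    ('over', RandomOverSampler(sampling_strategy=0.5, random_state=42)),   # grow the minority class
    ('under', RandomUnderSampler(sampling_strategy=0.8, random_state=42))  # then shrink the majority class
])
X_res, y_res = resample_pipe.fit_resample(X_toy, y_toy)
print(sorted(Counter(y_toy).items()), '->', sorted(Counter(y_res).items()))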
@@ -81,6 +83,7 @@ from sklearn.ensemble import AdaBoostClassifier
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
import json
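SMOTEENN (imported in the hunk above) combines SMOTE oversampling with Edited Nearest Neighbours cleaning. A hedged sketch with assumed parameters, not values taken from this commit:

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTEENN

# toy data again; k_neighbors / n_neighbors values are assumptions for illustration
X_toy, y_toy = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=42)
smote_enn = SMOTEENN(smote=SMOTE(k_neighbors=5, random_state=42),
                     enn=EditedNearestNeighbours(n_neighbors=3),
                     random_state=42)
X_res, y_res = smote_enn.fit_resample(X_toy, y_toy)
print(sorted(Counter(y_res).items()))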
@@ -119,6 +122,10 @@ from MultClassPipe3 import MultClassPipeSKFCV
gene = 'pncA'
drug = 'pyrazinamide'
#gene = 'katG'
#drug = 'isoniazid'
#==============
# directories
#==============
@@ -234,13 +241,13 @@ numerical_FN = common_cols_stabiltyN + foldX_cols + X_strFN + X_evolFN + X_genom
#categorical feature names
categorical_FN = ['ss_class'
, 'wt_prop_water'
# , 'wt_prop_water'
# , 'lineage_labels' # misleading if using merged_df3
, 'mut_prop_water'
, 'wt_prop_polarity'
, 'mut_prop_polarity'
, 'wt_calcprop'
, 'mut_calcprop'
# , 'mut_prop_water'
# , 'wt_prop_polarity'
# , 'mut_prop_polarity'
# , 'wt_calcprop'
# , 'mut_calcprop'
#, 'active_aa_pos'
]
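With most categorical columns commented out, 'ss_class' is the only feature left in categorical_FN. One possible way to keep it alongside the numerical columns would be a ColumnTransformer with one-hot encoding; this is purely illustrative and not part of this commit (the scaler choice is an assumption):

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# numerical_FN is defined earlier in this script; MinMaxScaler is an assumed choice
preprocessor = ColumnTransformer(transformers=[
    ('num', MinMaxScaler(), numerical_FN),
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['ss_class'])
])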
@@ -278,9 +285,9 @@ all_df_wtgt.shape
#------
# X
#------
X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL
#X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL
X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL
#X = all_df_wtgt[numerical_FN] # training numerical only
X = all_df_wtgt[numerical_FN] # training numerical only
#X_bts = blind_test_df[numerical_FN] # blind test data numerical
#------
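A sketch of how the numerical-only training matrix above could be resampled, assuming y is the target defined further down in this script; resampling would be applied to the training data only, leaving the blind test set X_bts untouched:

from collections import Counter
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)  # X, y from this script; X_bts is not resampled
print('before:', sorted(Counter(y).items()), 'after:', sorted(Counter(y_ros).items()))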