Added different scaling options for numeric features (min_max, std, min_max_neg, none)

This commit is contained in:
Tanushree Tunstall 2022-07-05 22:47:13 +01:00
parent ebef0c7967
commit 8d831f3613
3 changed files with 99 additions and 31 deletions

View file

@ -142,7 +142,9 @@ scoreBT_mapD = {'bts_mcc' : 'MCC'
# Run Multiple Classifiers # Run Multiple Classifiers
############################ ############################
# Multiple Classification - Model Pipeline # Multiple Classification - Model Pipeline
def MultModelsCl(input_df, target, skf_cv def MultModelsCl(input_df, target
#, skf_cv
, sel_cv
, blind_test_df , blind_test_df
, blind_test_target , blind_test_target
, tts_split_type , tts_split_type
@ -150,7 +152,8 @@ def MultModelsCl(input_df, target, skf_cv
, resampling_type = 'none' # default , resampling_type = 'none' # default
, add_cm = True # adds confusion matrix based on cross_val_predict , add_cm = True # adds confusion matrix based on cross_val_predict
, add_yn = True # adds target var class numbers , add_yn = True # adds target var class numbers
, var_type = ['numerical', 'categorical','mixed'] , var_type = ['numerical', 'categorical','mixed']
, scale_numeric = ['min_max', 'std', 'min_max_neg', 'none']
, run_blind_test = True , run_blind_test = True
, return_formatted_output = True): , return_formatted_output = True):
@ -182,24 +185,52 @@ def MultModelsCl(input_df, target, skf_cv
#====================================================== #======================================================
# Determine preprocessing steps ~ var_type # Determine preprocessing steps ~ var_type
#====================================================== #======================================================
if var_type == 'numerical':
t = [('num', MinMaxScaler(), numerical_ix)] # if var_type == 'numerical':
# t = [('num', MinMaxScaler(), numerical_ix)]
# if var_type == 'categorical':
# t = [('cat', OneHotEncoder(), categorical_ix)]
# # if var_type == 'mixed':
# # t = [('num', MinMaxScaler(), numerical_ix)
# # , ('cat', OneHotEncoder(), categorical_ix) ]
# if var_type == 'mixed':
# t = [('cat', OneHotEncoder(), categorical_ix) ]
if type(var_type) == list:
var_type = str(var_type[0])
else:
var_type = var_type
if var_type in ['numerical','mixed']:
if scale_numeric == ['none']:
t = [('cat', OneHotEncoder(), categorical_ix)]
if scale_numeric != ['none']:
if scale_numeric == ['min_max']:
scaler = MinMaxScaler()
if scale_numeric == ['min_max_neg']:
scaler = MinMaxScaler(feature_range=(-1, 1))
if scale_numeric == ['std']:
scaler = StandardScaler()
t = [('num', scaler, numerical_ix)
, ('cat', OneHotEncoder(), categorical_ix)]
if var_type == 'categorical': if var_type == 'categorical':
t = [('cat', OneHotEncoder(), categorical_ix)] t = [('cat', OneHotEncoder(), categorical_ix)]
if var_type == 'mixed':
t = [('num', MinMaxScaler(), numerical_ix)
, ('cat', OneHotEncoder(), categorical_ix) ]
col_transform = ColumnTransformer(transformers = t col_transform = ColumnTransformer(transformers = t
, remainder='passthrough') , remainder='passthrough')
#====================================================== #======================================================
# Specify multiple Classification Models # Specify multiple Classification Models
#====================================================== #======================================================
models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
, ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) ) , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
, ('Decision Tree' , DecisionTreeClassifier(**rs) ) , ('Decision Tree' , DecisionTreeClassifier(**rs) )
, ('Extra Tree' , ExtraTreeClassifier(**rs) ) , ('Extra Tree' , ExtraTreeClassifier(**rs) )
, ('Extra Trees' , ExtraTreesClassifier(**rs) ) , ('Extra Trees' , ExtraTreesClassifier(**rs) )
@ -211,18 +242,18 @@ def MultModelsCl(input_df, target, skf_cv
, ('Logistic Regression' , LogisticRegression(**rs) ) , ('Logistic Regression' , LogisticRegression(**rs) )
, ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
, ('MLP' , MLPClassifier(max_iter = 500, **rs) ) , ('MLP' , MLPClassifier(max_iter = 500, **rs) )
, ('Multinomial' , MultinomialNB() ) #, ('Multinomial' , MultinomialNB() )
, ('Naive Bayes' , BernoulliNB() ) , ('Naive Bayes' , BernoulliNB() )
, ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
, ('QDA' , QuadraticDiscriminantAnalysis() ) , ('QDA' , QuadraticDiscriminantAnalysis() )
, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
, ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 # , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
, n_estimators = 1000 # , n_estimators = 1000
, bootstrap = True # , bootstrap = True
, oob_score = True # , oob_score = True
, **njobs # , **njobs
, **rs # , **rs
, max_features = 'auto') ) # , max_features = 'auto') )
, ('Ridge Classifier' , RidgeClassifier(**rs) ) , ('Ridge Classifier' , RidgeClassifier(**rs) )
, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
, ('SVC' , SVC(**rs) ) , ('SVC' , SVC(**rs) )
@ -254,7 +285,7 @@ def MultModelsCl(input_df, target, skf_cv
skf_cv_modD = cross_validate(model_pipeline skf_cv_modD = cross_validate(model_pipeline
, input_df , input_df
, target , target
, cv = skf_cv , cv = sel_cv
, scoring = scoring_fn , scoring = scoring_fn
, return_train_score = True) , return_train_score = True)
#============================== #==============================
@ -283,7 +314,7 @@ def MultModelsCl(input_df, target, skf_cv
cmD = {} cmD = {}
# Calculate cm # Calculate cm
y_pred = cross_val_predict(model_pipeline, input_df, target, cv = skf_cv, **njobs) y_pred = cross_val_predict(model_pipeline, input_df, target, cv = sel_cv, **njobs)
#_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally #_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally
tn, fp, fn, tp = confusion_matrix(y_pred, target).ravel() tn, fp, fn, tp = confusion_matrix(y_pred, target).ravel()
@ -334,8 +365,9 @@ def MultModelsCl(input_df, target, skf_cv
bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2) bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
print('\nMCC on Blind test:' , bts_mcc_score) print('\nMCC on Blind test:' , bts_mcc_score)
print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2)) #print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
print('\nMCC on Training:' , mm_skf_scoresD[model_name]['test_mcc'] )
mm_skf_scoresD[model_name]['bts_mcc'] = bts_mcc_score mm_skf_scoresD[model_name]['bts_mcc'] = bts_mcc_score
mm_skf_scoresD[model_name]['bts_fscore'] = round(f1_score(blind_test_target, bts_predict),2) mm_skf_scoresD[model_name]['bts_fscore'] = round(f1_score(blind_test_target, bts_predict),2)
mm_skf_scoresD[model_name]['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2) mm_skf_scoresD[model_name]['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2)

View file

@ -26,7 +26,7 @@ from GetMLData import *
combined_model_paramD = {'data_combined_model' : True combined_model_paramD = {'data_combined_model' : True
, 'use_or' : False , 'use_or' : False
, 'omit_all_genomic_features': False , 'omit_all_genomic_features': False
, 'write_maskfile' : False , 'write_maskfile' : False # set to True once to write the mask file and check it
, 'write_outfile' : False } , 'write_outfile' : False }
pnca_df = getmldata('pncA', 'pyrazinamide' , **combined_model_paramD) pnca_df = getmldata('pncA', 'pyrazinamide' , **combined_model_paramD)

View file

@ -14,7 +14,8 @@ sys.path
# import # import
from GetMLData import * from GetMLData import *
from SplitTTS import * from SplitTTS import *
from MultClfs_fi import * #from MultClfs_fi import *
from MultClfs import *
#%% #%%
# X,y = load_boston(return_X_y=True) # X,y = load_boston(return_X_y=True)
@ -33,7 +34,7 @@ from MultClfs_fi import *
#%% #%%
sel_cv = StratifiedKFold(n_splits = 10 skf_cv = StratifiedKFold(n_splits = 10
, shuffle = True,**rs) , shuffle = True,**rs)
#sel_cv = logo #sel_cv = logo
# sel_cv = RepeatedStratifiedKFold(n_splits = 5 # sel_cv = RepeatedStratifiedKFold(n_splits = 5
@ -48,10 +49,21 @@ gene_model_paramD = {'data_combined_model' : False
#df = getmldata(gene, drug, **gene_model_paramD) #df = getmldata(gene, drug, **gene_model_paramD)
df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD) df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
df = getmldata('embB', 'ethambutol' , **gene_model_paramD)
df = getmldata('katG', 'isoniazid' , **gene_model_paramD)
df = getmldata('rpoB', 'rifampicin' , **gene_model_paramD)
df = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
#df = getmldata('alr' , 'cycloserine' , **combined_model_paramD)
all(df.columns.isin(['gene_name'])) # should be False
spl_type = '70_30'
spl_type = '80_20'
spl_type = 'sl'
df2 = split_tts(df df2 = split_tts(df
, data_type = 'actual' , data_type = 'actual'
, split_type = '70_30' , split_type = spl_type
, oversampling = False , oversampling = False
, dst_colname = 'dst' , dst_colname = 'dst'
, target_colname = 'dst_mode' , target_colname = 'dst_mode'
@ -61,19 +73,43 @@ df2 = split_tts(df
all(df2['X'].columns.isin(['gene_name'])) # should be False all(df2['X'].columns.isin(['gene_name'])) # should be False
fooD = MultClfs_fi (input_df = df2['X'] fooD = MultModelsCl(input_df = df2['X']
, target = df2['y'] , target = df2['y']
, sel_cv = sel_cv , sel_cv = skf_cv
, run_blind_test = True , run_blind_test = True
, blind_test_df = df2['X_bts'] , blind_test_df = df2['X_bts']
, blind_test_target = df2['y_bts'] , blind_test_target = df2['y_bts']
, tts_split_type = '70_30' , tts_split_type = spl_type
, var_type = 'mixed'
, resampling_type = 'none' # default , resampling_type = 'none' # default
) , var_type = ['mixed']
, scale_numeric = ['min_max_neg']
, return_formatted_output = False
)
for k, v in fooD.items(): for k, v in fooD.items():
print('\nModel:', k print('\nModel:', k
, '\nTRAIN MCC:', fooD[k]['test_mcc'] , '\nTRAIN MCC:', fooD[k]['test_mcc']
, '\nBTS MCC:' , fooD[k]['bts_mcc'] , '\nBTS MCC:' , fooD[k]['bts_mcc']
, '\nDIFF:',fooD[k]['bts_mcc'] - fooD[k]['test_mcc'] ) , '\nDIFF:',fooD[k]['bts_mcc'] - fooD[k]['test_mcc'] )
#%% CHECK SCALING
embb_df = getmldata('embB', 'ethambutol' , **combined_model_paramD)
all(embb_df.columns.isin(['gene_name'])) # should be False
scaler = MinMaxScaler(feature_range=(-1, 1))
bar = embb_df[['vdwclashes_rr', 'electro_rr']]
bar_df1 = scaler.fit_transform(bar)
bar_df1 = pd.DataFrame(bar_df1)
bar_df1.rename(columns = {0:'vdw_scaled', 1: 'ele_scaled'}, inplace = True)
bar2 = pd.concat([bar, bar_df1], axis = 1)
scaler2 = StandardScaler()
baz = embb_df[['vdwclashes_rr', 'electro_rr']]
baz_df1 = scaler2.fit_transform(baz)
baz_df1 = pd.DataFrame(baz_df1)
baz_df1.rename(columns = {0:'vdw_scaled', 1: 'ele_scaled'}, inplace = True)
baz2 = pd.concat([baz, baz_df1], axis = 1)
a = pd.concat([bar2, baz2], axis = 1)