Added file containing model names and hyperparams to run for all models, including FS

This commit is contained in:
Tanushree Tunstall 2022-05-24 09:14:41 +01:00
parent 9c07ad3ce8
commit 5d6dccfc09
6 changed files with 536 additions and 299 deletions

View file

@ -10,19 +10,26 @@ Created on Mon May 23 23:25:26 2022
def fsgs(input_df
, target
, blind_test_df = pd.DataFrame()
, blind_test_target = pd.Series(dtype = 'int64')
#, y_trueS = pd.Series()
, estimator = LogisticRegression(**rs)
, param_gridLd = {}
, cv_method = 10
, cv_method = StratifiedKFold(n_splits = 10
, shuffle = True,**rs)
, var_type = ['numerical'
, 'categorical'
, 'mixed']
, fs_estimator = [LogisticRegression(**rs)]
, fs = RFECV(DecisionTreeClassifier(**rs) , cv = 10, scoring = 'matthews_corrcoef')
, fs = RFECV(DecisionTreeClassifier(**rs)
, cv = StratifiedKFold(n_splits = 10
, shuffle = True,**rs)
, scoring = 'matthews_corrcoef')
):
'''
returns
Dict containing results from FS and hyperparam tuning
Dict containing results from FS and hyperparam tuning for a given estimator
>>> ADD MORE <<<
optimised/selected based on mcc
'''
# Determine categorical and numerical features
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
@ -68,11 +75,10 @@ def fsgs(input_df
############################################################################
# Create Pipeline object
pipe = Pipeline([
#('pre', MinMaxScaler()),
('pre', col_transform),
('fs', fs),
#('clf', LogisticRegression(**rs))])
('clf', estimator)])
('pre', col_transform),
('fs', fs),
#('clf', LogisticRegression(**rs))])
('clf', estimator)])
############################################################################
# Define GridSearchCV
gscv_fs = GridSearchCV(pipe
@ -119,8 +125,8 @@ def fsgs(input_df
#tp = gscv_fs.predict(X_bts)
tp = gscv_fs.predict(blind_test_df)
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2))
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2))
print('\nMCC on Blind test:' , round(matthews_corrcoef(blind_test_target, tp),2))
print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, tp),2))
#=================
# info extraction
@ -191,9 +197,9 @@ def fsgs(input_df
#bts_predict = gscv_fs.predict(X_bts)
bts_predict = gscv_fs.predict(blind_test_df)
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, bts_predict),2))
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2))
bts_mcc_score = round(matthews_corrcoef(y_bts, bts_predict),2)
print('\nMCC on Blind test:' , round(matthews_corrcoef(blind_test_target, bts_predict),2))
print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
# Diff b/w train and bts test scores
train_test_diff = train_bscore - bts_mcc_score
@ -213,12 +219,12 @@ def fsgs(input_df
lr_btsD
#lr_btsD['bts_mcc'] = bts_mcc_score
lr_btsD['bts_fscore'] = round(f1_score(y_bts, bts_predict),2)
lr_btsD['bts_precision'] = round(precision_score(y_bts, bts_predict),2)
lr_btsD['bts_recall'] = round(recall_score(y_bts, bts_predict),2)
lr_btsD['bts_accuracy'] = round(accuracy_score(y_bts, bts_predict),2)
lr_btsD['bts_roc_auc'] = round(roc_auc_score(y_bts, bts_predict),2)
lr_btsD['bts_jaccard'] = round(jaccard_score(y_bts, bts_predict),2)
lr_btsD['bts_fscore'] = round(f1_score(blind_test_target, bts_predict),2)
lr_btsD['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2)
lr_btsD['bts_recall'] = round(recall_score(blind_test_target, bts_predict),2)
lr_btsD['bts_accuracy'] = round(accuracy_score(blind_test_target, bts_predict),2)
lr_btsD['bts_roc_auc'] = round(roc_auc_score(blind_test_target, bts_predict),2)
lr_btsD['bts_jaccard'] = round(jaccard_score(blind_test_target, bts_predict),2)
lr_btsD
#===========================
@ -229,7 +235,7 @@ def fsgs(input_df
fs_methodf = str(gscv_fs.best_estimator_.named_steps['fs'])
all_featuresL = list(all_features)
fs_res_arrayf = str(list( gscv_fs.best_estimator_.named_steps['fs'].get_support()))
fs_res_array_rankf = list( gscv_fs.best_estimator_.named_steps['fs'].ranking_)
fs_res_array_rankf = str(list( gscv_fs.best_estimator_.named_steps['fs'].ranking_))
sel_featuresf = list(sel_features)
n_sf = int(n_sf)