#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 23 20:39:20 2022

@author: tanu

Combine the cross-validation (CV) and blind-test (BT) scores of several
models into a single table.

Provenance: recovered from patch 1d3190899de1062fea8dc1ae72d0bb909c5b8abd
(Tanushree Tunstall, Thu, 23 Jun 2022 21:27:13 +0100), which created
scripts/ml/ProcessMultModelsCl.py with the subject "added
ProcessMultModelsCl.py that processes the output for multiple models".
"""
import sys

import pandas as pd


def ProcessMultModelCl(inputD=None):
    """Stack the CV and BT score columns of ``inputD`` and attach metadata.

    The input is turned into a DataFrame and transposed. Columns whose name
    matches ``test_`` form the CV frame and columns matching ``bts_`` form
    the BT frame. Both are renamed through ``scoreCV_mapD`` / ``scoreBT_mapD``
    and tagged with a ``Data_source`` column ('CV' or 'BT'), concatenated
    row-wise, and finally merged on the index with the metadata columns
    (sizes, timings, confusion-matrix counts, resampling).

    Parameters
    ----------
    inputD : anything accepted by ``pd.DataFrame``, optional
        Scores to process; ``None`` (the default) behaves like an empty dict.
        NOTE(review): presumably the per-model score dict returned by
        ``MultModelsCl`` -- confirm against the caller.

    Returns
    -------
    pandas.DataFrame
        Index-wise merge of the stacked CV/BT scores with the metadata
        columns.

    Raises
    ------
    SystemExit
        If the CV and BT frames do not end up with the same set of columns,
        or if the concatenated frame does not have the expected dimensions.

    Notes
    -----
    ``scoreCV_mapD``, ``scoreBT_mapD`` and ``tts_split`` are not defined in
    this file; they are looked up as globals at call time.
    NOTE(review): confirm where these are meant to come from.
    """
    scoresDFT = pd.DataFrame({} if inputD is None else inputD).T

    #------------------------
    # WF: only CV and BTS
    #-----------------------
    # Explicit copies, so that adding 'Data_source' below writes to frames
    # that are independent of scoresDFT.
    scoresDF_CV = scoresDFT.filter(regex='test_', axis=1).copy()
    # map colnames for consistency to allow concatenating
    scoresDF_CV.columns = scoresDF_CV.columns.map(scoreCV_mapD)
    scoresDF_CV['Data_source'] = 'CV'

    scoresDF_BT = scoresDFT.filter(regex='bts_', axis=1).copy()
    # map colnames for consistency to allow concatenating
    scoresDF_BT.columns = scoresDF_BT.columns.map(scoreBT_mapD)
    scoresDF_BT['Data_source'] = 'BT'

    metaDF = scoresDFT.filter(
        regex='training_size|testSize|_time|TN|FP|FN|TP|.*_neg|.*_pos|resampling',
        axis=1)

    #-----------------
    # Combine WF
    #-----------------
    dfs_combine_wf = [scoresDF_CV, scoresDF_BT]
    print('\n---------->\n', len(dfs_combine_wf))
    print(scoresDF_CV)
    print(scoresDF_BT)

    print('\nCV dim:', scoresDF_CV.shape
          , '\nBT dim:', scoresDF_BT.shape)

    dfs_nrows_wf = max(len(df) for df in dfs_combine_wf)
    dfs_ncols_wf = max(len(df.columns) for df in dfs_combine_wf)
    print(dfs_ncols_wf)

    expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
    expected_ncols_wf = dfs_ncols_wf

    # Columns present in every frame, listed in the first frame's order so
    # that the output layout does not depend on set iteration order.
    shared_cols = set.intersection(*(set(df.columns) for df in dfs_combine_wf))
    common_cols_wf = [col for col in dict.fromkeys(dfs_combine_wf[0].columns)
                      if col in shared_cols]
    print('\nCOMMON COLS:', common_cols_wf
          , dfs_ncols_wf)

    if len(common_cols_wf) != dfs_ncols_wf:
        # Nothing can be concatenated, so stop here instead of falling through
        # to the merge below with 'combined_baseline_wf' undefined.
        print('\nConcatenting dfs not possible [WF],check numbers ')
        sys.exit('\nFAIL: dfs do not share an identical set of columns [WF]')

    combined_baseline_wf = pd.concat(
        [df[common_cols_wf] for df in dfs_combine_wf], ignore_index=False)
    print('\nConcatenating dfs with different resampling methods [WF]:', tts_split
          , '\nNo. of dfs combining:', len(dfs_combine_wf))
    print('\n================================================^^^^^^^^^^^^')

    got_nrows = len(combined_baseline_wf)
    got_ncols = len(combined_baseline_wf.columns)
    if got_nrows != expected_nrows_wf or got_ncols != expected_ncols_wf:
        print('\nFAIL: concatenating failed'
              , '\nExpected nrows:', expected_nrows_wf
              , '\nGot:', got_nrows
              , '\nExpected ncols:', expected_ncols_wf
              , '\nGot:', got_ncols)
        sys.exit('\nFIRST IF FAILS')

    print('\n================================================^^^^^^^^^^^^')
    print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
          , '\nnrows in combined_df_wf:', got_nrows
          , '\nncols in combined_df_wf:', got_ncols)

    # TODO: add check here
    combDF = pd.merge(combined_baseline_wf, metaDF,
                      left_index=True, right_index=True)

    return combDF


# test
# Guarded so that importing this module does not trigger the run below.
# NOTE(review): MultModelsCl and every input used here (X_smnc, y_smnc,
# tts_split_7030, skf_cv, X_bts, y_bts) are not defined in this file and are
# expected to already exist in the executing namespace -- confirm.
if __name__ == '__main__':
    #ProcessMultModelCl(smnc_scores_mmD)
    bazDF = MultModelsCl(input_df = X_smnc
                         , target = y_smnc
                         , var_type = 'mixed'
                         , tts_split_type = tts_split_7030
                         , resampling_type = 'smnc'
                         , skf_cv = skf_cv
                         , blind_test_df = X_bts
                         , blind_test_target = y_bts
                         , add_cm = True
                         , add_yn = True
                         , return_formatted_output = True)