#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 23 20:39:20 2022

@author: tanu
"""
import os, sys
import pandas as pd
import numpy as np
import re

##############################################################################
#%% FUNCTION: Process output dict from MultModelsCl
############################
# ProcessMultModelsCl()
############################
# Processes the dict returned by MultModelsCl() when use_formatted_output = True.
# NOTE: relies on the column-name mapping dicts scoreCV_mapD and scoreBT_mapD,
# which are expected to be defined in the module that provides MultModelsCl().

def ProcessMultModelsCl(inputD = {}):

    scoresDF = pd.DataFrame(inputD)

    #------------------------
    # Extracting split_name
    #------------------------
    tts_split_nameL = []
    for k, v in inputD.items():
        tts_split_nameL = tts_split_nameL + [v['tts_split']]

    if len(set(tts_split_nameL)) == 1:
        tts_split_name = str(list(set(tts_split_nameL))[0])
    else:
        # fallback so the name is always defined, even if the splits are mixed
        tts_split_name = '_'.join(sorted(set(tts_split_nameL)))
    print('\nExtracting tts_split_name:', tts_split_name)

    #------------------------
    # WF: only CV and BTS
    #------------------------
    scoresDFT = scoresDF.T

    scoresDF_CV = scoresDFT.filter(regex='^test_.*$', axis = 1); scoresDF_CV.columns
    # map colnames for consistency to allow concatenating
    scoresDF_CV.columns = scoresDF_CV.columns.map(scoreCV_mapD); scoresDF_CV.columns
    scoresDF_CV['source_data'] = 'CV'

    scoresDF_BT = scoresDFT.filter(regex='^bts_.*$', axis = 1); scoresDF_BT.columns
    # map colnames for consistency to allow concatenating
    scoresDF_BT.columns = scoresDF_BT.columns.map(scoreBT_mapD); scoresDF_BT.columns
    scoresDF_BT['source_data'] = 'BT'

    # dfs_combine_wf = [baseline_BT, smnc_BT, ros_BT, rus_BT, rouC_BT,
    #                   baseline_CV, smnc_CV, ros_CV, rus_CV, rouC_CV]

    #baseline_all = baseline_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
    #metaDF = scoresDFT.filter(regex='training_size|blind_test_size|_time|TN|FP|FN|TP|.*_neg|.*_pos|resampling', axis = 1); scoresDF_BT.columns
    #metaDF = scoresDFT.filter(regex='n_.*$|_time|TN|FP|FN|TP|.*_neg|.*_pos|resampling|tts.*', axis = 1); metaDF.columns
    # everything that is not a CV (test_*), blind-test (bts_*) or train (train_*) score is metadata
    metaDF = scoresDFT.filter(regex='^(?!test_.*$|bts_.*$|train_.*$).*'); metaDF.columns

    print('\nTotal cols in each df:'
          , '\nCV df:', len(scoresDF_CV.columns)
          , '\nBT df:', len(scoresDF_BT.columns)
          , '\nmetaDF:', len(metaDF.columns))

    if len(scoresDF_CV.columns) == len(scoresDF_BT.columns):
        print('\nFirst proceeding to rowbind CV and BT dfs:')
        expected_ncols_out = len(scoresDF_BT.columns) + len(metaDF.columns)
        print('\nFinal output should have:', expected_ncols_out, 'columns')
    else:
        sys.exit('\nFAIL: CV and BT dfs have different numbers of columns, cannot rowbind')

    #-----------------
    # Combine WF
    #-----------------
    dfs_combine_wf = [scoresDF_CV, scoresDF_BT]

    print('\nCombining', len(dfs_combine_wf), 'dfs using pd.concat by row ~ rowbind'
          , '\nChecking dims of dfs to combine:'
          , '\nDim of CV:', scoresDF_CV.shape
          , '\nDim of BT:', scoresDF_BT.shape)
    #print(scoresDF_CV)
    #print(scoresDF_BT)

    dfs_nrows_wf = []
    for df in dfs_combine_wf:
        dfs_nrows_wf = dfs_nrows_wf + [len(df)]
    dfs_nrows_wf = max(dfs_nrows_wf)

    dfs_ncols_wf = []
    for df in dfs_combine_wf:
        dfs_ncols_wf = dfs_ncols_wf + [len(df.columns)]
    dfs_ncols_wf = max(dfs_ncols_wf)
    print(dfs_ncols_wf)

    expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
    expected_ncols_wf = dfs_ncols_wf

    common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))
    print('\nNumber of common columns:', len(common_cols_wf)
          , '\nThese are:', common_cols_wf)

    if len(common_cols_wf) == dfs_ncols_wf:
        combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index = False)
        print('\nConcatenating dfs with different resampling methods [WF]:'
              , '\nSplit type:', tts_split_name
              , '\nNo. of dfs combining:', len(dfs_combine_wf))
        #print('\n================================================^^^^^^^^^^^^')
        if len(combined_baseline_wf) == expected_nrows_wf and len(combined_baseline_wf.columns) == expected_ncols_wf:
            #print('\n================================================^^^^^^^^^^^^')
            print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
                  , '\nnrows in combined_df_wf:', len(combined_baseline_wf)
                  , '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
        else:
            print('\nFAIL: concatenating failed'
                  , '\nExpected nrows:', expected_nrows_wf
                  , '\nGot:', len(combined_baseline_wf)
                  , '\nExpected ncols:', expected_ncols_wf
                  , '\nGot:', len(combined_baseline_wf.columns))
            sys.exit('\nFIRST IF FAILS')
    else:
        sys.exit('\nConcatenating dfs not possible [WF]: check numbers')

    #-------------------------------------
    # Combine WF + Metadata: Final output
    #-------------------------------------
    # checking indices for the dfs to combine (sorted, since set order is arbitrary)
    c1 = sorted(set(combined_baseline_wf.index))
    c2 = sorted(set(metaDF.index))

    if c1 == c2:
        print('\nPASS: proceeding to merge metadata with CV and BT dfs')
        combDF = pd.merge(combined_baseline_wf, metaDF, left_index = True, right_index = True)
    else:
        sys.exit('\nFAIL: Could not merge metadata with CV and BT dfs')

    if len(combDF.columns) == expected_ncols_out:
        print('\nPASS: Combined df has expected ncols')
    else:
        sys.exit('\nFAIL: Length mismatch for combined_df')

    print('\n========================================================='
          , '\nSUCCESS: Ran multiple classifiers'
          , '\n=========================================================')

    #resampling_methods_wf = combined_baseline_wf[['resampling']]
    #resampling_methods_wf = resampling_methods_wf.drop_duplicates()
    #, '\n', resampling_methods_wf)

    return combDF
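
##############################################################################
#%% EXAMPLE USAGE (illustrative sketch only)
# This block is not part of the original pipeline. It is a minimal,
# hypothetical example of the structure ProcessMultModelsCl() expects from
# MultModelsCl(): one entry per model, each holding 'tts_split', CV scores
# keyed 'test_*', blind-test scores keyed 'bts_*', and a few metadata fields.
# The model names, metric names, and the toy scoreCV_mapD/scoreBT_mapD
# mappings below are assumptions for illustration; the real mapping dicts
# are expected to come from the module that defines MultModelsCl().
if __name__ == '__main__':

    # Toy column-name mappings (assumed names); only defined if the real
    # dicts are not already in scope, to avoid shadowing them.
    if 'scoreCV_mapD' not in globals():
        scoreCV_mapD = {'test_mcc': 'MCC', 'test_accuracy': 'Accuracy'}
    if 'scoreBT_mapD' not in globals():
        scoreBT_mapD = {'bts_mcc': 'MCC', 'bts_accuracy': 'Accuracy'}

    # Toy output dict mimicking the shape returned by MultModelsCl()
    toy_inputD = {
        'Logistic Regression': {'tts_split': '70_30',
                                'test_mcc': 0.50, 'test_accuracy': 0.75,
                                'bts_mcc': 0.48, 'bts_accuracy': 0.73,
                                'fit_time': 0.10, 'score_time': 0.01},
        'Random Forest':       {'tts_split': '70_30',
                                'test_mcc': 0.55, 'test_accuracy': 0.78,
                                'bts_mcc': 0.52, 'bts_accuracy': 0.76,
                                'fit_time': 0.50, 'score_time': 0.02},
    }

    # Combines CV and BT rows, then merges the metadata columns back in
    toy_combDF = ProcessMultModelsCl(toy_inputD)
    print('\nExample combined output:\n', toy_combDF)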