From 19da36842b76903e78cadd23572857952dd514ed Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 24 Jun 2022 13:24:04 +0100 Subject: [PATCH] removed the two functions MultModelsCl.py and ProcessMultModelsCl.py as these have now been combined --- scripts/ml/MultModelsCl.py | 14 ++++-- scripts/ml/ProcessMultModelsCl.py | 78 +++++++++++++++++++++---------- 2 files changed, 63 insertions(+), 29 deletions(-) diff --git a/scripts/ml/MultModelsCl.py b/scripts/ml/MultModelsCl.py index fcc36ee..d68b117 100755 --- a/scripts/ml/MultModelsCl.py +++ b/scripts/ml/MultModelsCl.py @@ -105,6 +105,10 @@ jacc_score_fn = {'jcc': make_scorer(jaccard_score)} from ProcessMultModelsCl import * #%% +############################ +# MultModelsCl() +# Run Multiple Classifiers +############################ # Multiple Classification - Model Pipeline def MultModelsCl(input_df, target, skf_cv , blind_test_df @@ -340,17 +344,17 @@ def MultModelsCl(input_df, target, skf_cv mm_skf_scoresD[model_name]['n_training_size'] = len(input_df) mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(yc1_ratio, 2) - mm_skf_scoresD[model_name]['n_blind_test_size'] = len(blind_test_df) - mm_skf_scoresD[model_name]['n_testY_ratio'] = round(yc2_ratio,2) - mm_skf_scoresD[model_name]['n_features'] = len(input_df.columns) - mm_skf_scoresD[model_name]['tts_split'] = tts_split_type + mm_skf_scoresD[model_name]['n_test_size'] = len(blind_test_df) + mm_skf_scoresD[model_name]['n_testY_ratio'] = round(yc2_ratio,2) + mm_skf_scoresD[model_name]['n_features'] = len(input_df.columns) + mm_skf_scoresD[model_name]['tts_split'] = tts_split_type #return(mm_skf_scoresD) #============================ # Process the dict to have WF #============================ if return_formatted_output: - CV_BT_metaDF = ProcessMultModelCl(mm_skf_scoresD) + CV_BT_metaDF = ProcessMultModelsCl(mm_skf_scoresD) return(CV_BT_metaDF) else: return(mm_skf_scoresD) \ No newline at end of file diff --git a/scripts/ml/ProcessMultModelsCl.py 
b/scripts/ml/ProcessMultModelsCl.py index d25baa6..2d26f01 100644 --- a/scripts/ml/ProcessMultModelsCl.py +++ b/scripts/ml/ProcessMultModelsCl.py @@ -10,7 +10,12 @@ import pandas as pd import numpy as np import re ############################################################################## -#%% FUNCTION: Process outout dicr from MultModelsCl +#%% FUNCTION: Process output dict from MultModelsCl +############################ +# ProcessMultModelsCl() +############################ +# Processes the dict returned by MultModelsCl() when return_formatted_output = True + def ProcessMultModelsCl(inputD = {}): scoresDF = pd.DataFrame(inputD) @@ -25,43 +30,52 @@ def ProcessMultModelsCl(inputD = {}): if len(set(tts_split_nameL)) == 1: tts_split_name = str(list(set(tts_split_nameL))[0]) print('\nExtracting tts_split_name:', tts_split_name) - + #------------------------ # WF: only CV and BTS #----------------------- scoresDFT = scoresDF.T - scoresDF_CV = scoresDFT.filter(regex='test_', axis = 1); scoresDF_CV.columns + scoresDF_CV = scoresDFT.filter(regex='^test_.*$', axis = 1); scoresDF_CV.columns # map colnames for consistency to allow concatenting scoresDF_CV.columns = scoresDF_CV.columns.map(scoreCV_mapD); scoresDF_CV.columns - scoresDF_CV['Data_source'] = 'CV' + scoresDF_CV['source_data'] = 'CV' - scoresDF_BT = scoresDFT.filter(regex='bts_', axis = 1); scoresDF_BT.columns + scoresDF_BT = scoresDFT.filter(regex='^bts_.*$', axis = 1); scoresDF_BT.columns # map colnames for consistency to allow concatenting scoresDF_BT.columns = scoresDF_BT.columns.map(scoreBT_mapD); scoresDF_BT.columns - scoresDF_BT['Data_source'] = 'BT' + scoresDF_BT['source_data'] = 'BT' # dfs_combine_wf = [baseline_BT, smnc_BT, ros_BT, rus_BT, rouC_BT, # baseline_CV, smnc_CV, ros_CV, rus_CV, rouC_CV] #baseline_all = baseline_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0) - metaDF = scoresDFT.filter(regex='training_size|blind_test_size|_time|TN|FP|FN|TP|.*_neg|.*_pos|resampling', axis = 
1); scoresDF_BT.columns + #metaDF = scoresDFT.filter(regex='training_size|blind_test_size|_time|TN|FP|FN|TP|.*_neg|.*_pos|resampling', axis = 1); scoresDF_BT.columns + #metaDF = scoresDFT.filter(regex='n_.*$|_time|TN|FP|FN|TP|.*_neg|.*_pos|resampling|tts.*', axis = 1); metaDF.columns + metaDF = scoresDFT.filter(regex='^(?!test_.*$|bts_.*$|train_.*$).*'); metaDF.columns + + print('\nTotal cols in each df:' + , '\nCV df:', len(scoresDF_CV.columns) + , '\nBT_df:', len(scoresDF_BT.columns) + , '\nmetaDF:', len(metaDF.columns)) + if len(scoresDF_CV.columns) == len(scoresDF_BT.columns): + print('\nFirst proceeding to rowbind CV and BT dfs:') + expected_ncols_out = len(scoresDF_BT.columns) + len(metaDF.columns) + print('\nFinal output should have:',expected_ncols_out, 'columns' ) #----------------- # Combine WF #----------------- dfs_combine_wf = [scoresDF_CV, scoresDF_BT] - print('\n---------->\n', len(dfs_combine_wf)) - print(scoresDF_CV) - print(scoresDF_BT) print('\nCombinig', len(dfs_combine_wf), 'using pd.concat by row ~ rowbind' , '\nChecking Dims of df to combine:' , '\nDim of CV:', scoresDF_CV.shape , '\nDim of BT:', scoresDF_BT.shape) + #print(scoresDF_CV) + #print(scoresDF_BT) - dfs_nrows_wf = [] for df in dfs_combine_wf: dfs_nrows_wf = dfs_nrows_wf + [len(df)] @@ -77,19 +91,17 @@ def ProcessMultModelsCl(inputD = {}): expected_ncols_wf = dfs_ncols_wf common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf))) - print('\nFinding Common cols to ensure row bind is correct:', len(common_cols_wf) - , '\nCOMMON cols are:', common_cols_wf - , dfs_ncols_wf) + print('\nNumber of Common columns:', dfs_ncols_wf + , '\nThese are:', common_cols_wf) if len(common_cols_wf) == dfs_ncols_wf : combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index=False) - #resampling_methods_wf = combined_baseline_wf[['resampling']] - #resampling_methods_wf = resampling_methods_wf.drop_duplicates() - print('\nConcatenating dfs with 
different resampling methods [WF]:', tts_split_name + print('\nConcatenating dfs with different resampling methods [WF]:' + , '\nSplit type:', tts_split_name , '\nNo. of dfs combining:', len(dfs_combine_wf)) - print('\n================================================^^^^^^^^^^^^') + #print('\n================================================^^^^^^^^^^^^') if len(combined_baseline_wf) == expected_nrows_wf and len(combined_baseline_wf.columns) == expected_ncols_wf: - print('\n================================================^^^^^^^^^^^^') + #print('\n================================================^^^^^^^^^^^^') print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined' , '\nnrows in combined_df_wf:', len(combined_baseline_wf) @@ -105,12 +117,30 @@ def ProcessMultModelsCl(inputD = {}): print('\nConcatenting dfs not possible [WF],check numbers ') - # TODOadd check here - combDF = pd.merge(combined_baseline_wf, metaDF, left_index = True, right_index = True) + #------------------------------------- + # Combine WF+Metadata: Final output + #------------------------------------- + # checking indices for the dfs to combine: + c1 = list(set(combined_baseline_wf.index)) + c2 = list(metaDF.index) + + if c1 == c2: + print('\nPASS: proceeding to merge metadata with CV and BT dfs') + combDF = pd.merge(combined_baseline_wf, metaDF, left_index = True, right_index = True) + else: + sys.exit('\nFAIL: Could not merge metadata with CV and BT dfs') + + if len(combDF.columns) == expected_ncols_out: + print('\nPASS: Combined df has expected ncols') + else: + sys.exit('\nFAIL: Length mismatch for combined_df') + + print('\n=========================================================' + , '\nSUCCESS: Ran multiple classifiers' + , '\n=======================================================') + #resampling_methods_wf = combined_baseline_wf[['resampling']] #resampling_methods_wf = resampling_methods_wf.drop_duplicates() #, '\n', resampling_methods_wf) - return combDF - 
-############################################################################### + return combDF \ No newline at end of file