added ProcessMultModelsCl.py that processes the output for multiple models

2022-06-23 21:27:13 +01:00 · 2022-06-23 21:27:13 +01:00 · 1d3190899d
commit 1d3190899d
parent 4fe62c072b
1 changed files with 109 additions and 0 deletions
--- a/scripts/ml/ProcessMultModelsCl.py
+++ b/scripts/ml/ProcessMultModelsCl.py
@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jun 23 20:39:20 2022
+
+@author: tanu
+"""
+
+def ProcessMultModelCl(inputD = {}):
+    scoresDF = pd.DataFrame(inputD)
+    #------------------------
+    #  WF: only CV and BTS
+    #-----------------------
+    scoresDFT = scoresDF.T
+    
+    scoresDF_CV = scoresDFT.filter(regex='test_', axis = 1); scoresDF_CV.columns
+    # map colnames for consistency to allow concatenting
+    scoresDF_CV.columns = scoresDF_CV.columns.map(scoreCV_mapD); scoresDF_CV.columns
+    scoresDF_CV['Data_source'] = 'CV'
+    
+    scoresDF_BT = scoresDFT.filter(regex='bts_', axis = 1); scoresDF_BT.columns
+    # map colnames for consistency to allow concatenting
+    scoresDF_BT.columns = scoresDF_BT.columns.map(scoreBT_mapD); scoresDF_BT.columns
+    scoresDF_BT['Data_source'] = 'BT'
+    
+    # dfs_combine_wf = [baseline_BT, smnc_BT, ros_BT, rus_BT, rouC_BT,
+    #                   baseline_CV, smnc_CV, ros_CV, rus_CV, rouC_CV]
+    
+    #baseline_all = baseline_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
+
+    metaDF = scoresDFT.filter(regex='training_size|testSize|_time|TN|FP|FN|TP|.*_neg|.*_pos|resampling', axis = 1); scoresDF_BT.columns
+
+    #-----------------
+    # Combine WF
+    #-----------------
+    dfs_combine_wf = [scoresDF_CV, scoresDF_BT]
+    print('\n---------->\n', len(dfs_combine_wf))
+    print(scoresDF_CV)
+    print(scoresDF_BT)
+
+    print('\nCV dim:', scoresDF_CV.shape
+          , '\nBT dim:',scoresDF_BT.shape)
+
+    
+    dfs_nrows_wf = []
+    for df in dfs_combine_wf:
+        dfs_nrows_wf = dfs_nrows_wf + [len(df)]
+    dfs_nrows_wf = max(dfs_nrows_wf)
+        
+    dfs_ncols_wf = []
+    for df in dfs_combine_wf:
+        dfs_ncols_wf = dfs_ncols_wf + [len(df.columns)]
+    dfs_ncols_wf = max(dfs_ncols_wf)
+    print(dfs_ncols_wf)
+    
+    expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
+    expected_ncols_wf = dfs_ncols_wf
+    
+    common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))
+    print('\nCOMMON COLS:', common_cols_wf
+          , dfs_ncols_wf)
+    
+    if len(common_cols_wf) == dfs_ncols_wf :
+        combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index=False)
+        #resampling_methods_wf = combined_baseline_wf[['resampling']]
+        #resampling_methods_wf = resampling_methods_wf.drop_duplicates()
+        print('\nConcatenating dfs with different resampling methods [WF]:', tts_split
+              , '\nNo. of dfs combining:', len(dfs_combine_wf))
+        print('\n================================================^^^^^^^^^^^^')
+        if len(combined_baseline_wf) == expected_nrows_wf  and len(combined_baseline_wf.columns) == expected_ncols_wf:
+            print('\n================================================^^^^^^^^^^^^')
+
+            print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
+                  , '\nnrows in combined_df_wf:', len(combined_baseline_wf)
+                  , '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
+        else:
+            print('\nFAIL: concatenating failed'
+                  , '\nExpected nrows:', expected_nrows_wf
+                  , '\nGot:', len(combined_baseline_wf)
+                  , '\nExpected ncols:', expected_ncols_wf
+                  , '\nGot:', len(combined_baseline_wf.columns))
+            sys.exit('\nFIRST IF FAILS')
+    else:
+        print('\nConcatenting dfs not possible [WF],check numbers ')    
+    
+    
+    # TODOadd check here
+    combDF = pd.merge(combined_baseline_wf, metaDF, left_index = True, right_index = True)
+    #resampling_methods_wf = combined_baseline_wf[['resampling']]
+    #resampling_methods_wf = resampling_methods_wf.drop_duplicates()
+              #, '\n', resampling_methods_wf)
+
+    return combDF
+
+
+# test
+
+#ProcessMultModelCl(smnc_scores_mmD)
+bazDF = MultModelsCl(input_df = X_smnc
+                    , target = y_smnc
+                    , var_type = 'mixed'
+                    , tts_split_type = tts_split_7030
+                    , resampling_type = 'smnc'
+                    , skf_cv = skf_cv
+                    , blind_test_df = X_bts
+                    , blind_test_target = y_bts
+                    , add_cm = True 
+                    , add_yn = True
+                    , return_formatted_output = True)