From 19da36842b76903e78cadd23572857952dd514ed Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 24 Jun 2022 13:24:04 +0100
Subject: [PATCH] removed the two functions MultModelsCl.py and
 ProcessMultModelsCl.py as these have now been combined

---
 scripts/ml/MultModelsCl.py        | 14 ++++--
 scripts/ml/ProcessMultModelsCl.py | 78 +++++++++++++++++++++----------
 2 files changed, 63 insertions(+), 29 deletions(-)

diff --git a/scripts/ml/MultModelsCl.py b/scripts/ml/MultModelsCl.py
index fcc36ee..d68b117 100755
--- a/scripts/ml/MultModelsCl.py
+++ b/scripts/ml/MultModelsCl.py
@@ -105,6 +105,10 @@ jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
 from ProcessMultModelsCl import *
 
 #%%
+############################
+# MultModelsCl()
+# Run Multiple Classifiers
+############################
 # Multiple Classification - Model Pipeline
 def MultModelsCl(input_df, target, skf_cv
                        , blind_test_df
@@ -340,17 +344,17 @@ def MultModelsCl(input_df, target, skf_cv
         mm_skf_scoresD[model_name]['n_training_size']   = len(input_df)
         mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(yc1_ratio, 2)
        
-        mm_skf_scoresD[model_name]['n_blind_test_size'] = len(blind_test_df)
-        mm_skf_scoresD[model_name]['n_testY_ratio']     = round(yc2_ratio,2)
-        mm_skf_scoresD[model_name]['n_features']        = len(input_df.columns)
-        mm_skf_scoresD[model_name]['tts_split']         = tts_split_type
+        mm_skf_scoresD[model_name]['n_test_size']     = len(blind_test_df)
+        mm_skf_scoresD[model_name]['n_testY_ratio']   = round(yc2_ratio,2)
+        mm_skf_scoresD[model_name]['n_features']      = len(input_df.columns)
+        mm_skf_scoresD[model_name]['tts_split']       = tts_split_type
 
     #return(mm_skf_scoresD)
     #============================
     # Process the dict to have WF
     #============================
     if return_formatted_output: 
-        CV_BT_metaDF = ProcessMultModelCl(mm_skf_scoresD)
+        CV_BT_metaDF = ProcessMultModelsCl(mm_skf_scoresD)
         return(CV_BT_metaDF)
     else:
         return(mm_skf_scoresD)
\ No newline at end of file
diff --git a/scripts/ml/ProcessMultModelsCl.py b/scripts/ml/ProcessMultModelsCl.py
index d25baa6..2d26f01 100644
--- a/scripts/ml/ProcessMultModelsCl.py
+++ b/scripts/ml/ProcessMultModelsCl.py
@@ -10,7 +10,12 @@ import pandas as pd
 import numpy as np
 import re
 ##############################################################################
-#%% FUNCTION: Process outout dicr from MultModelsCl
+#%% FUNCTION: Process output dict from MultModelsCl
+############################
+# ProcessMultModelsCl() 
+############################
+# Processes the dict returned by MultModelsCl() when return_formatted_output = True
+
 def ProcessMultModelsCl(inputD = {}):
     
     scoresDF = pd.DataFrame(inputD)
@@ -25,43 +30,52 @@ def ProcessMultModelsCl(inputD = {}):
     if len(set(tts_split_nameL)) == 1:
         tts_split_name = str(list(set(tts_split_nameL))[0])
         print('\nExtracting tts_split_name:', tts_split_name)
-        
+    
     #------------------------
     #  WF: only CV and BTS
     #-----------------------
     scoresDFT = scoresDF.T
     
-    scoresDF_CV = scoresDFT.filter(regex='test_', axis = 1); scoresDF_CV.columns
+    scoresDF_CV = scoresDFT.filter(regex='^test_.*$', axis = 1); scoresDF_CV.columns
     # map colnames for consistency to allow concatenting
     scoresDF_CV.columns = scoresDF_CV.columns.map(scoreCV_mapD); scoresDF_CV.columns
-    scoresDF_CV['Data_source'] = 'CV'
+    scoresDF_CV['source_data'] = 'CV'
     
-    scoresDF_BT = scoresDFT.filter(regex='bts_', axis = 1); scoresDF_BT.columns
+    scoresDF_BT = scoresDFT.filter(regex='^bts_.*$', axis = 1); scoresDF_BT.columns
     # map colnames for consistency to allow concatenting
     scoresDF_BT.columns = scoresDF_BT.columns.map(scoreBT_mapD); scoresDF_BT.columns
-    scoresDF_BT['Data_source'] = 'BT'
+    scoresDF_BT['source_data'] = 'BT'
     
     # dfs_combine_wf = [baseline_BT, smnc_BT, ros_BT, rus_BT, rouC_BT,
     #                   baseline_CV, smnc_CV, ros_CV, rus_CV, rouC_CV]
     
     #baseline_all = baseline_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
 
-    metaDF = scoresDFT.filter(regex='training_size|blind_test_size|_time|TN|FP|FN|TP|.*_neg|.*_pos|resampling', axis = 1); scoresDF_BT.columns
+    #metaDF = scoresDFT.filter(regex='training_size|blind_test_size|_time|TN|FP|FN|TP|.*_neg|.*_pos|resampling', axis = 1); scoresDF_BT.columns
+    #metaDF = scoresDFT.filter(regex='n_.*$|_time|TN|FP|FN|TP|.*_neg|.*_pos|resampling|tts.*', axis = 1); metaDF.columns
+    metaDF = scoresDFT.filter(regex='^(?!test_.*$|bts_.*$|train_.*$).*'); metaDF.columns
+    
+    print('\nTotal cols in each df:'
+          , '\nCV df:', len(scoresDF_CV.columns)
+          , '\nBT_df:', len(scoresDF_BT.columns)
+          , '\nmetaDF:', len(metaDF.columns))
+    if  len(scoresDF_CV.columns) == len(scoresDF_BT.columns):
+        print('\nFirst proceeding to rowbind CV and BT dfs:')
+        expected_ncols_out = len(scoresDF_BT.columns) + len(metaDF.columns)
+        print('\nFinal output should have:',expected_ncols_out, 'columns' )
 
     #-----------------
     # Combine WF
     #-----------------
     dfs_combine_wf = [scoresDF_CV, scoresDF_BT]
-    print('\n---------->\n', len(dfs_combine_wf))
-    print(scoresDF_CV)
-    print(scoresDF_BT)
 
     print('\nCombinig', len(dfs_combine_wf), 'using pd.concat by row ~ rowbind'
           , '\nChecking Dims of df to combine:'
           , '\nDim of CV:', scoresDF_CV.shape
           , '\nDim of BT:', scoresDF_BT.shape)
+    #print(scoresDF_CV)
+    #print(scoresDF_BT)
 
-    
     dfs_nrows_wf = []
     for df in dfs_combine_wf:
         dfs_nrows_wf = dfs_nrows_wf + [len(df)]
@@ -77,19 +91,17 @@ def ProcessMultModelsCl(inputD = {}):
     expected_ncols_wf = dfs_ncols_wf
     
     common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))
-    print('\nFinding Common cols to ensure row bind is correct:', len(common_cols_wf)
-          , '\nCOMMON cols are:', common_cols_wf
-          , dfs_ncols_wf)
+    print('\nNumber of Common columns:', dfs_ncols_wf
+          , '\nThese are:', common_cols_wf)
     
     if len(common_cols_wf) == dfs_ncols_wf :
         combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index=False)
-        #resampling_methods_wf = combined_baseline_wf[['resampling']]
-        #resampling_methods_wf = resampling_methods_wf.drop_duplicates()
-        print('\nConcatenating dfs with different resampling methods [WF]:', tts_split_name
+        print('\nConcatenating dfs with different resampling methods [WF]:'
+              , '\nSplit type:', tts_split_name
               , '\nNo. of dfs combining:', len(dfs_combine_wf))
-        print('\n================================================^^^^^^^^^^^^')
+        #print('\n================================================^^^^^^^^^^^^')
         if len(combined_baseline_wf) == expected_nrows_wf  and len(combined_baseline_wf.columns) == expected_ncols_wf:
-            print('\n================================================^^^^^^^^^^^^')
+            #print('\n================================================^^^^^^^^^^^^')
 
             print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
                   , '\nnrows in combined_df_wf:', len(combined_baseline_wf)
@@ -105,12 +117,30 @@ def ProcessMultModelsCl(inputD = {}):
         print('\nConcatenting dfs not possible [WF],check numbers ')    
     
     
-    # TODOadd check here
-    combDF = pd.merge(combined_baseline_wf, metaDF, left_index = True, right_index = True)
+    #-------------------------------------
+    # Combine WF+Metadata: Final output
+    #-------------------------------------
+    # checking indices for the dfs to combine:
+    c1 = list(set(combined_baseline_wf.index))
+    c2 = list(metaDF.index)
+    
+    if c1 == c2:
+        print('\nPASS: proceeding to merge metadata with CV and BT dfs')
+        combDF = pd.merge(combined_baseline_wf, metaDF, left_index = True, right_index = True)
+    else:
+        sys.exit('\nFAIL: Could not merge metadata with CV and BT dfs')
+    
+    if len(combDF.columns) == expected_ncols_out:
+        print('\nPASS: Combined df has expected ncols')
+    else:
+        sys.exit('\nFAIL: Length mismatch for combined_df')
+    
+    print('\n========================================================='
+          , '\nSUCCESS: Ran multiple classifiers'
+          , '\n=======================================================')
+        
     #resampling_methods_wf = combined_baseline_wf[['resampling']]
     #resampling_methods_wf = resampling_methods_wf.drop_duplicates()
               #, '\n', resampling_methods_wf)
 
-    return combDF
-
-###############################################################################
+    return combDF
\ No newline at end of file