# LSHTM_analysis/scripts/ml/ProcessMultModelsCl.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 23 20:39:20 2022
@author: tanu
"""
import os, sys
import pandas as pd
import numpy as np
import re
##############################################################################
#%% FUNCTION: Process output dict from MultModelsCl
############################
# ProcessMultModelsCl()
############################
#Processes the dict from above if use_formatted_output = True
def ProcessMultModelsCl(inputD = None):
    """
    Process the output dict from MultModelsCl (one entry per classifier,
    each value a dict of scores and metadata).

    Splits each model's scores into cross-validation (CV, 'test_*' keys)
    and blind-test (BT, 'bts_*' keys) rows, harmonises the column names via
    the module-level scoreCV_mapD/scoreBT_mapD maps, row-binds the two score
    frames, and merges the remaining metadata columns back in by index.

    Parameters
    ----------
    inputD : dict, optional
        Dict of model_name -> score dict, as produced by MultModelsCl with
        use_formatted_output = True. Defaults to an empty dict.

    Returns
    -------
    pandas.DataFrame
        One row per (model, source_data) pair: CV/BT scores plus metadata.

    Exits via sys.exit() with a message when a consistency check fails.
    """
    # FIX: avoid mutable default argument; None sentinel is backward compatible.
    if inputD is None:
        inputD = {}

    scoresDF = pd.DataFrame(inputD)

    #------------------------
    # Extracting split_name
    #-----------------------
    tts_split_nameL = [v['tts_split'] for v in inputD.values()]
    if len(set(tts_split_nameL)) == 1:
        tts_split_name = str(list(set(tts_split_nameL))[0])
        print('\nExtracting tts_split_name:', tts_split_name)
    else:
        # FIX: previously tts_split_name stayed undefined when models
        # disagreed on the split name (NameError at the later print);
        # fall back to a deterministic joined label instead.
        tts_split_name = '_'.join(sorted(set(tts_split_nameL)))
        print('\nWARNING: multiple tts_split names found, using:', tts_split_name)

    #------------------------
    # WF: only CV and BTS
    #-----------------------
    scoresDFT = scoresDF.T

    # .copy() so adding 'source_data' below does not trigger
    # SettingWithCopyWarning on the filter() result.
    scoresDF_CV = scoresDFT.filter(regex='^test_.*$', axis = 1).copy()
    # map colnames for consistency to allow concatenating
    scoresDF_CV.columns = scoresDF_CV.columns.map(scoreCV_mapD)
    scoresDF_CV['source_data'] = 'CV'

    scoresDF_BT = scoresDFT.filter(regex='^bts_.*$', axis = 1).copy()
    # map colnames for consistency to allow concatenating
    scoresDF_BT.columns = scoresDF_BT.columns.map(scoreBT_mapD)
    scoresDF_BT['source_data'] = 'BT'

    # Everything that is neither a CV, BT nor train score is metadata
    # (sizes, timings, confusion-matrix counts, resampling, tts info, ...).
    metaDF = scoresDFT.filter(regex='^(?!test_.*$|bts_.*$|train_.*$).*')

    print('\nTotal cols in each df:'
          , '\nCV df:', len(scoresDF_CV.columns)
          , '\nBT_df:', len(scoresDF_BT.columns)
          , '\nmetaDF:', len(metaDF.columns))

    if len(scoresDF_CV.columns) != len(scoresDF_BT.columns):
        # FIX: the original fell through here, leaving combined_baseline_wf
        # and expected_ncols_out undefined and crashing later with a
        # NameError; fail fast with a clear message instead.
        sys.exit('\nFAIL: CV and BT dfs have different number of columns')

    print('\nFirst proceeding to rowbind CV and BT dfs:')
    expected_ncols_out = len(scoresDF_BT.columns) + len(metaDF.columns)
    print('\nFinal output should have:', expected_ncols_out, 'columns')

    #-----------------
    # Combine WF
    #-----------------
    dfs_combine_wf = [scoresDF_CV, scoresDF_BT]
    print('\nCombinig', len(dfs_combine_wf), 'using pd.concat by row ~ rowbind'
          , '\nChecking Dims of df to combine:'
          , '\nDim of CV:', scoresDF_CV.shape
          , '\nDim of BT:', scoresDF_BT.shape)

    dfs_nrows_wf = max(len(df) for df in dfs_combine_wf)
    dfs_ncols_wf = max(len(df.columns) for df in dfs_combine_wf)
    print(dfs_ncols_wf)

    expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
    expected_ncols_wf = dfs_ncols_wf

    common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))
    print('\nNumber of Common columns:', dfs_ncols_wf
          , '\nThese are:', common_cols_wf)

    if len(common_cols_wf) != dfs_ncols_wf:
        # FIX: originally only printed and continued, crashing later on the
        # undefined combined_baseline_wf; exit here instead.
        sys.exit('\nConcatenting dfs not possible [WF],check numbers ')

    combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf]
                                     , ignore_index = False)
    print('\nConcatenating dfs with different resampling methods [WF]:'
          , '\nSplit type:', tts_split_name
          , '\nNo. of dfs combining:', len(dfs_combine_wf))

    if (len(combined_baseline_wf) == expected_nrows_wf
            and len(combined_baseline_wf.columns) == expected_ncols_wf):
        print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
              , '\nnrows in combined_df_wf:', len(combined_baseline_wf)
              , '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
    else:
        print('\nFAIL: concatenating failed'
              , '\nExpected nrows:', expected_nrows_wf
              , '\nGot:', len(combined_baseline_wf)
              , '\nExpected ncols:', expected_ncols_wf
              , '\nGot:', len(combined_baseline_wf.columns))
        sys.exit('\nFIRST IF FAILS')

    #-------------------------------------
    # Combine WF+Metadata: Final output
    #-------------------------------------
    # FIX: compare index *sets*. combined_baseline_wf carries each model
    # index twice (CV + BT rows), and the old list(set(...)) == list(index)
    # check depended on arbitrary set ordering, so it failed
    # nondeterministically even when the indices matched.
    if set(combined_baseline_wf.index) == set(metaDF.index):
        print('\nPASS: proceeding to merge metadata with CV and BT dfs')
        combDF = pd.merge(combined_baseline_wf, metaDF
                          , left_index = True, right_index = True)
    else:
        sys.exit('\nFAIL: Could not merge metadata with CV and BT dfs')

    if len(combDF.columns) == expected_ncols_out:
        print('\nPASS: Combined df has expected ncols')
    else:
        sys.exit('\nFAIL: Length mismatch for combined_df')

    print('\n========================================================='
          , '\nSUCCESS: Ran multiple classifiers'
          , '\n=======================================================')

    return combDF