LSHTM_analysis/scripts/ml/ProcessMultModelsCl.py
2022-06-24 13:21:21 +01:00

116 lines
4.6 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 23 20:39:20 2022
@author: tanu
"""
import os, sys
import pandas as pd
import numpy as np
import re
##############################################################################
#%% FUNCTION: Process output dict from MultModelsCl
def ProcessMultModelsCl(inputD = None):
    """
    Combine the CV and blind-test (BT) scores held in a nested results dict.

    inputD is turned into a DataFrame and transposed, so every top-level key
    of inputD becomes one row. Columns whose name matches 'test_' form the CV
    scores and columns matching 'bts_' form the BT scores. Both sets are
    renamed through scoreCV_mapD / scoreBT_mapD so that they share column
    names, tagged with a 'Data_source' column ('CV' or 'BT'), row-bound with
    pd.concat, and finally merged on the index with the metadata columns
    (sizes, timings, TN/FP/FN/TP, *_neg/*_pos, resampling).

    NOTE(review): scoreCV_mapD and scoreBT_mapD are not defined in this
    function; they are expected to exist at module level - confirm where they
    come from.

    Parameters
    ----------
    inputD : dict, optional
        Nested mapping whose values can be indexed with 'tts_split'
        (assumed to be a dict of per-model score dicts as returned by
        MultModelsCl - confirm against the caller). Defaults to an empty dict.

    Returns
    -------
    pandas.DataFrame
        The row-bound CV and BT scores joined with the metadata columns on
        the index. The index is NOT reset, so each top-level key of inputD
        appears once per data source.

    Raises
    ------
    SystemExit
        If the CV and BT frames do not share all of their columns, or if the
        combined frame does not have the expected shape.
    """
    if inputD is None:
        inputD = {}

    scoresDF = pd.DataFrame(inputD)

    #------------------------
    # Extracting split_name
    #-----------------------
    # dict.fromkeys de-duplicates while keeping first-seen order
    split_names = list(dict.fromkeys(v['tts_split'] for v in inputD.values()))

    if len(split_names) == 1:
        tts_split_name = str(split_names[0])
        print('\nExtracting tts_split_name:', tts_split_name)
    else:
        # The name is only used for reporting further down, so keep going but
        # make sure it is always bound (it used to be undefined on this path).
        tts_split_name = ', '.join(str(name) for name in split_names)
        print('\nWARNING: expected exactly one tts_split name, found'
              , len(split_names), ':', tts_split_name)

    #------------------------
    # WF: only CV and BTS
    #-----------------------
    scoresDFT = scoresDF.T

    # .copy() so the assignments below act on independent frames rather than
    # on whatever object filter() hands back
    scoresDF_CV = scoresDFT.filter(regex='test_', axis = 1).copy()
    # map colnames for consistency to allow concatenating
    scoresDF_CV.columns = scoresDF_CV.columns.map(scoreCV_mapD)
    scoresDF_CV['Data_source'] = 'CV'

    scoresDF_BT = scoresDFT.filter(regex='bts_', axis = 1).copy()
    # map colnames for consistency to allow concatenating
    scoresDF_BT.columns = scoresDF_BT.columns.map(scoreBT_mapD)
    scoresDF_BT['Data_source'] = 'BT'

    # Everything that is not a score: sizes, timings, confusion-matrix counts etc.
    metaDF = scoresDFT.filter(regex='training_size|blind_test_size|_time|TN|FP|FN|TP|.*_neg|.*_pos|resampling', axis = 1)

    #-----------------
    # Combine WF
    #-----------------
    dfs_combine_wf = [scoresDF_CV, scoresDF_BT]

    print('\n---------->\n', len(dfs_combine_wf))
    print(scoresDF_CV)
    print(scoresDF_BT)

    print('\nCombinig', len(dfs_combine_wf), 'using pd.concat by row ~ rowbind'
          , '\nChecking Dims of df to combine:'
          , '\nDim of CV:', scoresDF_CV.shape
          , '\nDim of BT:', scoresDF_BT.shape)

    dfs_nrows_wf = max(len(df) for df in dfs_combine_wf)
    dfs_ncols_wf = max(len(df.columns) for df in dfs_combine_wf)
    print(dfs_ncols_wf)

    expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
    expected_ncols_wf = dfs_ncols_wf

    # Keep the CV frame's column order instead of the arbitrary order of a
    # set, so the output layout is the same on every run.
    shared_cols = set.intersection(*(set(df.columns) for df in dfs_combine_wf))
    common_cols_wf = [col for col in dict.fromkeys(scoresDF_CV.columns)
                      if col in shared_cols]

    print('\nFinding Common cols to ensure row bind is correct:', len(common_cols_wf)
          , '\nCOMMON cols are:', common_cols_wf
          , dfs_ncols_wf)

    if len(common_cols_wf) != dfs_ncols_wf:
        # Without a full set of shared columns there is nothing valid to
        # merge; stop here instead of failing later on an unbound variable.
        print('\nConcatenting dfs not possible [WF],check numbers ')
        sys.exit('\nFAIL: CV and BT scores do not share all columns, cannot combine')

    combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index=False)

    print('\nConcatenating dfs with different resampling methods [WF]:', tts_split_name
          , '\nNo. of dfs combining:', len(dfs_combine_wf))
    print('\n================================================^^^^^^^^^^^^')

    shape_ok = (len(combined_baseline_wf) == expected_nrows_wf
                and len(combined_baseline_wf.columns) == expected_ncols_wf)

    if not shape_ok:
        print('\nFAIL: concatenating failed'
              , '\nExpected nrows:', expected_nrows_wf
              , '\nGot:', len(combined_baseline_wf)
              , '\nExpected ncols:', expected_ncols_wf
              , '\nGot:', len(combined_baseline_wf.columns))
        sys.exit('\nFIRST IF FAILS')

    print('\n================================================^^^^^^^^^^^^')
    print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
          , '\nnrows in combined_df_wf:', len(combined_baseline_wf)
          , '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))

    # Index-on-index join: attaches the metadata of each row label to both
    # its CV row and its BT row.
    combDF = pd.merge(combined_baseline_wf, metaDF, left_index = True, right_index = True)

    return combDF
###############################################################################