removed the two files MultModelsCl.py and ProcessMultModelsCl.py as these have now been combined
This commit is contained in:
parent
ad99efedd7
commit
19da36842b
2 changed files with 63 additions and 29 deletions
|
@ -105,6 +105,10 @@ jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
|
||||||
from ProcessMultModelsCl import *
|
from ProcessMultModelsCl import *
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
|
############################
|
||||||
|
# MultModelsCl()
|
||||||
|
# Run Multiple Classifiers
|
||||||
|
############################
|
||||||
# Multiple Classification - Model Pipeline
|
# Multiple Classification - Model Pipeline
|
||||||
def MultModelsCl(input_df, target, skf_cv
|
def MultModelsCl(input_df, target, skf_cv
|
||||||
, blind_test_df
|
, blind_test_df
|
||||||
|
@ -340,17 +344,17 @@ def MultModelsCl(input_df, target, skf_cv
|
||||||
mm_skf_scoresD[model_name]['n_training_size'] = len(input_df)
|
mm_skf_scoresD[model_name]['n_training_size'] = len(input_df)
|
||||||
mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(yc1_ratio, 2)
|
mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(yc1_ratio, 2)
|
||||||
|
|
||||||
mm_skf_scoresD[model_name]['n_blind_test_size'] = len(blind_test_df)
|
mm_skf_scoresD[model_name]['n_test_size'] = len(blind_test_df)
|
||||||
mm_skf_scoresD[model_name]['n_testY_ratio'] = round(yc2_ratio,2)
|
mm_skf_scoresD[model_name]['n_testY_ratio'] = round(yc2_ratio,2)
|
||||||
mm_skf_scoresD[model_name]['n_features'] = len(input_df.columns)
|
mm_skf_scoresD[model_name]['n_features'] = len(input_df.columns)
|
||||||
mm_skf_scoresD[model_name]['tts_split'] = tts_split_type
|
mm_skf_scoresD[model_name]['tts_split'] = tts_split_type
|
||||||
|
|
||||||
#return(mm_skf_scoresD)
|
#return(mm_skf_scoresD)
|
||||||
#============================
|
#============================
|
||||||
# Process the dict to have WF
|
# Process the dict to have WF
|
||||||
#============================
|
#============================
|
||||||
if return_formatted_output:
|
if return_formatted_output:
|
||||||
CV_BT_metaDF = ProcessMultModelCl(mm_skf_scoresD)
|
CV_BT_metaDF = ProcessMultModelsCl(mm_skf_scoresD)
|
||||||
return(CV_BT_metaDF)
|
return(CV_BT_metaDF)
|
||||||
else:
|
else:
|
||||||
return(mm_skf_scoresD)
|
return(mm_skf_scoresD)
|
|
@ -10,7 +10,12 @@ import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import re
|
import re
|
||||||
##############################################################################
|
##############################################################################
|
||||||
#%% FUNCTION: Process outout dicr from MultModelsCl
|
#%% FUNCTION: Process output dict from MultModelsCl
|
||||||
|
############################
|
||||||
|
# ProcessMultModelsCl()
|
||||||
|
############################
|
||||||
|
#Processes the dict from above if return_formatted_output = True
|
||||||
|
|
||||||
def ProcessMultModelsCl(inputD = {}):
|
def ProcessMultModelsCl(inputD = {}):
|
||||||
|
|
||||||
scoresDF = pd.DataFrame(inputD)
|
scoresDF = pd.DataFrame(inputD)
|
||||||
|
@ -25,43 +30,52 @@ def ProcessMultModelsCl(inputD = {}):
|
||||||
if len(set(tts_split_nameL)) == 1:
|
if len(set(tts_split_nameL)) == 1:
|
||||||
tts_split_name = str(list(set(tts_split_nameL))[0])
|
tts_split_name = str(list(set(tts_split_nameL))[0])
|
||||||
print('\nExtracting tts_split_name:', tts_split_name)
|
print('\nExtracting tts_split_name:', tts_split_name)
|
||||||
|
|
||||||
#------------------------
|
#------------------------
|
||||||
# WF: only CV and BTS
|
# WF: only CV and BTS
|
||||||
#-----------------------
|
#-----------------------
|
||||||
scoresDFT = scoresDF.T
|
scoresDFT = scoresDF.T
|
||||||
|
|
||||||
scoresDF_CV = scoresDFT.filter(regex='test_', axis = 1); scoresDF_CV.columns
|
scoresDF_CV = scoresDFT.filter(regex='^test_.*$', axis = 1); scoresDF_CV.columns
|
||||||
# map colnames for consistency to allow concatenating
|
# map colnames for consistency to allow concatenating
|
||||||
scoresDF_CV.columns = scoresDF_CV.columns.map(scoreCV_mapD); scoresDF_CV.columns
|
scoresDF_CV.columns = scoresDF_CV.columns.map(scoreCV_mapD); scoresDF_CV.columns
|
||||||
scoresDF_CV['Data_source'] = 'CV'
|
scoresDF_CV['source_data'] = 'CV'
|
||||||
|
|
||||||
scoresDF_BT = scoresDFT.filter(regex='bts_', axis = 1); scoresDF_BT.columns
|
scoresDF_BT = scoresDFT.filter(regex='^bts_.*$', axis = 1); scoresDF_BT.columns
|
||||||
# map colnames for consistency to allow concatenating
|
# map colnames for consistency to allow concatenating
|
||||||
scoresDF_BT.columns = scoresDF_BT.columns.map(scoreBT_mapD); scoresDF_BT.columns
|
scoresDF_BT.columns = scoresDF_BT.columns.map(scoreBT_mapD); scoresDF_BT.columns
|
||||||
scoresDF_BT['Data_source'] = 'BT'
|
scoresDF_BT['source_data'] = 'BT'
|
||||||
|
|
||||||
# dfs_combine_wf = [baseline_BT, smnc_BT, ros_BT, rus_BT, rouC_BT,
|
# dfs_combine_wf = [baseline_BT, smnc_BT, ros_BT, rus_BT, rouC_BT,
|
||||||
# baseline_CV, smnc_CV, ros_CV, rus_CV, rouC_CV]
|
# baseline_CV, smnc_CV, ros_CV, rus_CV, rouC_CV]
|
||||||
|
|
||||||
#baseline_all = baseline_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
|
#baseline_all = baseline_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
|
||||||
|
|
||||||
metaDF = scoresDFT.filter(regex='training_size|blind_test_size|_time|TN|FP|FN|TP|.*_neg|.*_pos|resampling', axis = 1); scoresDF_BT.columns
|
#metaDF = scoresDFT.filter(regex='training_size|blind_test_size|_time|TN|FP|FN|TP|.*_neg|.*_pos|resampling', axis = 1); scoresDF_BT.columns
|
||||||
|
#metaDF = scoresDFT.filter(regex='n_.*$|_time|TN|FP|FN|TP|.*_neg|.*_pos|resampling|tts.*', axis = 1); metaDF.columns
|
||||||
|
metaDF = scoresDFT.filter(regex='^(?!test_.*$|bts_.*$|train_.*$).*'); metaDF.columns
|
||||||
|
|
||||||
|
print('\nTotal cols in each df:'
|
||||||
|
, '\nCV df:', len(scoresDF_CV.columns)
|
||||||
|
, '\nBT_df:', len(scoresDF_BT.columns)
|
||||||
|
, '\nmetaDF:', len(metaDF.columns))
|
||||||
|
if len(scoresDF_CV.columns) == len(scoresDF_BT.columns):
|
||||||
|
print('\nFirst proceeding to rowbind CV and BT dfs:')
|
||||||
|
expected_ncols_out = len(scoresDF_BT.columns) + len(metaDF.columns)
|
||||||
|
print('\nFinal output should have:',expected_ncols_out, 'columns' )
|
||||||
|
|
||||||
#-----------------
|
#-----------------
|
||||||
# Combine WF
|
# Combine WF
|
||||||
#-----------------
|
#-----------------
|
||||||
dfs_combine_wf = [scoresDF_CV, scoresDF_BT]
|
dfs_combine_wf = [scoresDF_CV, scoresDF_BT]
|
||||||
print('\n---------->\n', len(dfs_combine_wf))
|
|
||||||
print(scoresDF_CV)
|
|
||||||
print(scoresDF_BT)
|
|
||||||
|
|
||||||
print('\nCombining', len(dfs_combine_wf), 'using pd.concat by row ~ rowbind'
|
print('\nCombining', len(dfs_combine_wf), 'using pd.concat by row ~ rowbind'
|
||||||
, '\nChecking Dims of df to combine:'
|
, '\nChecking Dims of df to combine:'
|
||||||
, '\nDim of CV:', scoresDF_CV.shape
|
, '\nDim of CV:', scoresDF_CV.shape
|
||||||
, '\nDim of BT:', scoresDF_BT.shape)
|
, '\nDim of BT:', scoresDF_BT.shape)
|
||||||
|
#print(scoresDF_CV)
|
||||||
|
#print(scoresDF_BT)
|
||||||
|
|
||||||
|
|
||||||
dfs_nrows_wf = []
|
dfs_nrows_wf = []
|
||||||
for df in dfs_combine_wf:
|
for df in dfs_combine_wf:
|
||||||
dfs_nrows_wf = dfs_nrows_wf + [len(df)]
|
dfs_nrows_wf = dfs_nrows_wf + [len(df)]
|
||||||
|
@ -77,19 +91,17 @@ def ProcessMultModelsCl(inputD = {}):
|
||||||
expected_ncols_wf = dfs_ncols_wf
|
expected_ncols_wf = dfs_ncols_wf
|
||||||
|
|
||||||
common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))
|
common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))
|
||||||
print('\nFinding Common cols to ensure row bind is correct:', len(common_cols_wf)
|
print('\nNumber of Common columns:', dfs_ncols_wf
|
||||||
, '\nCOMMON cols are:', common_cols_wf
|
, '\nThese are:', common_cols_wf)
|
||||||
, dfs_ncols_wf)
|
|
||||||
|
|
||||||
if len(common_cols_wf) == dfs_ncols_wf :
|
if len(common_cols_wf) == dfs_ncols_wf :
|
||||||
combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index=False)
|
combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index=False)
|
||||||
#resampling_methods_wf = combined_baseline_wf[['resampling']]
|
print('\nConcatenating dfs with different resampling methods [WF]:'
|
||||||
#resampling_methods_wf = resampling_methods_wf.drop_duplicates()
|
, '\nSplit type:', tts_split_name
|
||||||
print('\nConcatenating dfs with different resampling methods [WF]:', tts_split_name
|
|
||||||
, '\nNo. of dfs combining:', len(dfs_combine_wf))
|
, '\nNo. of dfs combining:', len(dfs_combine_wf))
|
||||||
print('\n================================================^^^^^^^^^^^^')
|
#print('\n================================================^^^^^^^^^^^^')
|
||||||
if len(combined_baseline_wf) == expected_nrows_wf and len(combined_baseline_wf.columns) == expected_ncols_wf:
|
if len(combined_baseline_wf) == expected_nrows_wf and len(combined_baseline_wf.columns) == expected_ncols_wf:
|
||||||
print('\n================================================^^^^^^^^^^^^')
|
#print('\n================================================^^^^^^^^^^^^')
|
||||||
|
|
||||||
print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
|
print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
|
||||||
, '\nnrows in combined_df_wf:', len(combined_baseline_wf)
|
, '\nnrows in combined_df_wf:', len(combined_baseline_wf)
|
||||||
|
@ -105,12 +117,30 @@ def ProcessMultModelsCl(inputD = {}):
|
||||||
print('\nConcatenating dfs not possible [WF], check numbers ')
|
print('\nConcatenating dfs not possible [WF], check numbers ')
|
||||||
|
|
||||||
|
|
||||||
# TODOadd check here
|
#-------------------------------------
|
||||||
combDF = pd.merge(combined_baseline_wf, metaDF, left_index = True, right_index = True)
|
# Combine WF+Metadata: Final output
|
||||||
|
#-------------------------------------
|
||||||
|
# checking indices for the dfs to combine:
|
||||||
|
c1 = list(set(combined_baseline_wf.index))
|
||||||
|
c2 = list(metaDF.index)
|
||||||
|
|
||||||
|
if c1 == c2:
|
||||||
|
print('\nPASS: proceeding to merge metadata with CV and BT dfs')
|
||||||
|
combDF = pd.merge(combined_baseline_wf, metaDF, left_index = True, right_index = True)
|
||||||
|
else:
|
||||||
|
sys.exit('\nFAIL: Could not merge metadata with CV and BT dfs')
|
||||||
|
|
||||||
|
if len(combDF.columns) == expected_ncols_out:
|
||||||
|
print('\nPASS: Combined df has expected ncols')
|
||||||
|
else:
|
||||||
|
sys.exit('\nFAIL: Length mismatch for combined_df')
|
||||||
|
|
||||||
|
print('\n========================================================='
|
||||||
|
, '\nSUCCESS: Ran multiple classifiers'
|
||||||
|
, '\n=======================================================')
|
||||||
|
|
||||||
#resampling_methods_wf = combined_baseline_wf[['resampling']]
|
#resampling_methods_wf = combined_baseline_wf[['resampling']]
|
||||||
#resampling_methods_wf = resampling_methods_wf.drop_duplicates()
|
#resampling_methods_wf = resampling_methods_wf.drop_duplicates()
|
||||||
#, '\n', resampling_methods_wf)
|
#, '\n', resampling_methods_wf)
|
||||||
|
|
||||||
return combDF
|
return combDF
|
||||||
|
|
||||||
###############################################################################
|
|
Loading…
Add table
Add a link
Reference in a new issue