optimised run_7030.py to generate output from a dict now that the process function and parameter dicts have been added

2022-06-24 15:40:18 +01:00 · 2022-06-24 15:40:18 +01:00 · b37a950fec
commit b37a950fec
parent 7dc7e25016
12 changed files with 180 additions and 128408 deletions
--- a/scripts/ml/run_7030.py
+++ b/scripts/ml/run_7030.py
@ -5,28 +5,28 @@ Created on Mon Jun 20 13:05:23 2022

@author: tanu
 """
+#%%Imports ####################################################################
 import re
 import argparse
-###############################################################################
+import os, sys
+
 # gene  = 'pncA'
 # drug  = 'pyrazinamide'
 #total_mtblineage_uc = 8
-
+###############################################################################
 #%% command line args: case sensitive
-arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
-arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
-args = arg_parser.parse_args()
+# arg_parser = argparse.ArgumentParser()
+# arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
+# arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
+# args = arg_parser.parse_args()

-drug    = args.drug
-gene    = args.gene
+# drug    = args.drug
+# gene    = args.gene

 ###############################################################################
-#==================
-# other vars
-#==================
-tts_split    = '70/30'
-OutFile_suffix  = '7030'
+homedir = os.path.expanduser("~")
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
+
 ###############################################################################
 #==================
 # Import data
@ -39,10 +39,15 @@ from ml_data_7030 import *
 #from UQ_yc_RunAllClfs import run_all_ML

 #====================
-# Import ML function 
+# Import ML functions 
 #====================
-# TT run all ML clfs: baseline model
-from MultModelsCl import MultModelsCl
+from MultClfs import *
+
+#==================
+# other vars
+#==================
+tts_split_7030    = '70_30'
+OutFile_suffix  = '7030'

 #==================
 # Specify outdir 
@ -51,563 +56,71 @@ outdir_ml = outdir + 'ml/tts_7030/'
 print('\nOutput directory:', outdir_ml)

 outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
-outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
+#outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'

-###############################################################################
-score_type_ordermapD = { 'mcc'      : 1
-                   , 'fscore'       : 2
-                   , 'jcc'          : 3
-                   , 'precision'    : 4
-                   , 'recall'       : 5      
-                   , 'accuracy'     : 6  
-                   , 'roc_auc'      : 7
-                   , 'TN'           : 8
-                   , 'FP'           : 9
-                   , 'FN'           : 10
-                   , 'TP'           : 11  
-                   , 'trainingY_neg': 12  
-                   , 'trainingY_pos': 13    
-                   , 'blindY_neg'   : 14
-                   , 'blindY_pos'   : 15
-                   , 'fit_time'     : 16
-                   , 'score_time'   : 17
-                   }
-
-scoreCV_mapD = {'test_mcc'         : 'MCC'
-                , 'test_fscore'    : 'F1'
-                , 'test_precision' : 'Precision'
-                , 'test_recall'    : 'Recall'
-                , 'test_accuracy'  : 'Accuracy'
-                , 'test_roc_auc'   : 'ROC_AUC'
-                , 'test_jcc'       : 'JCC'
-                }
-
-scoreBT_mapD = {'bts_mcc'          : 'MCC'
-                , 'bts_fscore'     : 'F1'
-                , 'bts_precision'  : 'Precision'
-                , 'bts_recall'     : 'Recall'
-                , 'bts_accuracy'   : 'Accuracy'
-                , 'bts_roc_auc'    : 'ROC_AUC'
-                , 'bts_jcc'        : 'JCC'
-               }
-
-# data dependent variables but NOT dependent on resampling
-bts_size  = len(X_bts)
-yc2       = Counter(y_bts)
-yc2_ratio = yc2[0]/yc2[1]
-
-###############################################################################
+#%% Running models ############################################################
 print('\n#####################################################################\n'
-      , '\nRunning ML analysis: feature groups '
+      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
      , '\nGene name:', gene
-      , '\nDrug name:', drug)
+      , '\nDrug name:', drug
+      , '\n#####################################################################\n')

-#%% Basic: No Oversampling
-#================
-# Baseline
-# No resampling
-#================  
-scores_mmD = MultModelsCl(input_df = X
-                    , target = y
-                    , var_type = 'mixed'
-                    , skf_cv = skf_cv
-                    , blind_test_df = X_bts
-                    , blind_test_target = y_bts
-                    , add_cm = True 
-                    , add_yn = True)
-
-baseline_all_scores = pd.DataFrame(scores_mmD)
-rs_none = 'none'
-#------------------------
-#  WF: only CV and BTS
-#-----------------------
-baseline_allT = baseline_all_scores.T
-#baseline_train = baseline_all.filter(regex='train_', axis=1)
-
-baseline_CV = baseline_allT.filter(regex='test_', axis = 1); baseline_CV.columns
-# map colnames for consistency to allow concatenting
-baseline_CV.columns = baseline_CV.columns.map(scoreCV_mapD); baseline_CV.columns
-baseline_CV['Data_source'] = 'CV'
-baseline_CV['Resampling']  = rs_none
-
-baseline_BT = baseline_allT.filter(regex='bts_', axis = 1); baseline_BT.columns
-# map colnames for consistency to allow concatenting
-baseline_BT.columns = baseline_BT.columns.map(scoreBT_mapD); baseline_BT.columns
-baseline_BT['Data_source'] = 'BT'
-baseline_BT['Resampling'] = rs_none
-
-# # Write csv
-#baseline_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
-#baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
-# baseline_CV.to_csv(outdir_ml + gene.lower() + '_baseline_CV_allF.csv')
-# baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
-
-#----------------------------------
-#  LF*: CV + BTS + Other info
-#-----------------------------------
-# other data dependent variables
-training_size_ns = len(X)
-n_features       = len(X.columns)
-yc1              = Counter(y)
-yc1_ratio        = yc1[0]/yc1[1]
-
-baseline_all = baseline_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
-baseline_all = baseline_all.reset_index()
-baseline_all.rename(columns = {'index': 'original_names'}, inplace = True)
-
-# Indicate whether BT or CV
-bt_pattern = re.compile(r'bts_.*')
-baseline_all['Data_source'] = baseline_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
-
-baseline_all['score_type'] = baseline_all['original_names'].str.replace('bts_|test_', '', regex = True)
-
-score_type_uniqueN = set(baseline_all['score_type'])
-cL1 = list(score_type_ordermapD.keys())
-cL2 = list(score_type_uniqueN)
-
-if set(cL1).issubset(cL2):
-    print('\nPASS: sorting df by score that is mapped onto the order I want')
-    baseline_all['score_order'] = baseline_all['score_type'].map(score_type_ordermapD)
-    baseline_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
-else:
-    sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
-
-# add cols: specific
-baseline_all['Resampling']     = rs_none
-baseline_all['training_size']  = training_size_ns
-baseline_all['trainingY_ratio']= round(yc1_ratio,2)
-baseline_all['n_features']     = n_features
-
-###############################################################################
-#%% SMOTE NC: Smote Oversampling [Numerical + categorical]
-#================
-# Baseline
-# SMOTE NC: SMNC
-#================
-smnc_scores_mmD = MultModelsCl(input_df = X_smnc
-                    , target = y_smnc
-                    , var_type = 'mixed'
-                    , skf_cv = skf_cv
-                    , blind_test_df = X_bts
-                    , blind_test_target = y_bts
-                    , add_cm = True 
-                    , add_yn = True)
-
-smnc_all_scores = pd.DataFrame(smnc_scores_mmD)
-rs_smnc = 'smnc'
-#------------------------
-#  WF: only CV and BTS
-#-----------------------
-smnc_allT = smnc_all_scores.T
-
-smnc_CV = smnc_allT.filter(regex='test_', axis = 1); smnc_CV.columns
-# map colnames for consistency to allow concatenting
-smnc_CV.columns = smnc_CV.columns.map(scoreCV_mapD); smnc_CV.columns
-smnc_CV['Data_source'] = 'CV'
-smnc_CV['Resampling']  = rs_smnc
-
-smnc_BT = smnc_allT.filter(regex='bts_', axis = 1); smnc_BT.columns
-# map colnames for consistency to allow concatenting
-smnc_BT.columns = smnc_BT.columns.map(scoreBT_mapD); smnc_BT.columns
-smnc_BT['Data_source'] = 'BT'
-smnc_BT['Resampling']  = rs_smnc
-
-# Write csv
-# smnc_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
-# smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
-# smnc_CV.to_csv(outdir_ml + gene.lower() + '_smnc_CV_allF.csv')
-# smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
-
-#----------------------------------
-#  LF*: CV + BTS + Other info
-#-----------------------------------
-# other data dependent variables
-training_size_smnc = len(X_smnc)
-n_features         = len(X_smnc.columns)
-yc1_smnc              = Counter(y_smnc)
-yc1_ratio_smnc        = yc1_smnc[0]/yc1_smnc[1]
-
-smnc_all = smnc_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
-smnc_all = smnc_all.reset_index()
-smnc_all.rename(columns = {'index': 'original_names'}, inplace = True)
-
-# Indicate whether BT or CV
-bt_pattern = re.compile(r'bts_.*')
-smnc_all['Data_source'] = smnc_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
-
-smnc_all['score_type'] = smnc_all['original_names'].str.replace('bts_|test_', '', regex = True)
-
-score_type_uniqueN = set(smnc_all['score_type'])
-cL1 = list(score_type_ordermapD.keys())
-cL2 = list(score_type_uniqueN)
-
-if set(cL1).issubset(cL2):
-    print('\nPASS: sorting df by score that is mapped onto the order I want')
-    smnc_all['score_order'] = smnc_all['score_type'].map(score_type_ordermapD)
-    smnc_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
-else:
-    sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
-  
-# add cols: specific
-smnc_all['Resampling']      = rs_smnc
-smnc_all['training_size']   = training_size_smnc
-smnc_all['trainingY_ratio'] = round(yc1_ratio_smnc,2)
-smnc_all['n_features']      = n_features
-
-###############################################################################
-#%% ROS: Random Over Sampling [Numerical + categorical]
-#================
-# Baseline
-# ROS
-#================
-ros_scores_mmD = MultModelsCl(input_df = X_ros
-                    , target = y_ros
-                    , var_type = 'mixed'
-                    , skf_cv = skf_cv
-                    , blind_test_df = X_bts
-                    , blind_test_target = y_bts
-                    , add_cm = True 
-                    , add_yn = True)
-
-ros_all_scores = pd.DataFrame(ros_scores_mmD)
-rs_ros = 'ros'
-#------------------------
-#  WF: only CV and BTS
-#-----------------------
-ros_allT = ros_all_scores.T
-
-ros_CV  = ros_allT.filter(regex='test_', axis = 1); ros_CV.columns
-# map colnames for consistency to allow concatenting
-ros_CV.columns = ros_CV.columns.map(scoreCV_mapD); ros_CV.columns
-ros_CV['Data_source'] = 'CV'
-ros_CV['Resampling'] = rs_ros
-
-ros_BT  = ros_allT.filter(regex='bts_', axis = 1); ros_BT.columns
-# map colnames for consistency to allow concatenting
-ros_BT.columns = ros_BT.columns.map(scoreBT_mapD); ros_BT.columns
-ros_BT['Data_source'] = 'BT'
-ros_BT['Resampling'] = rs_ros
-
-# Write csv
-# ros_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
-# ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
-# ros_CV.to_csv(outdir_ml + gene.lower() + '_ros_CV_allF.csv')
-# ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
-#----------------------------------
-#  LF*: CV + BTS + Other info
-#----------------------------------
-# other data dependent variables
-training_size_ros = len(X_ros)
-n_features        = len(X_ros.columns)
-yc1_ros             = Counter(y_ros)
-yc1_ratio_ros       = yc1_ros[0]/yc1_ros[1]
-
-ros_all = ros_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
-ros_all = ros_all.reset_index()
-ros_all.rename(columns = {'index': 'original_names'}, inplace = True)
-
-# Indicate whether BT or CV
-bt_pattern = re.compile(r'bts_.*')
-ros_all['Data_source'] = ros_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
-
-ros_all['score_type'] = ros_all['original_names'].str.replace('bts_|test_', '', regex = True)
-
-score_type_uniqueN = set(ros_all['score_type'])
-cL1 = list(score_type_ordermapD.keys())
-cL2 = list(score_type_uniqueN)
-
-if set(cL1).issubset(cL2):
-    print('\nPASS: sorting df by score that is mapped onto the order I want')
-    ros_all['score_order'] = ros_all['score_type'].map(score_type_ordermapD)
-    ros_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
-else:
-    sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
-
-# add cols: specific
-ros_all['Resampling']      = rs_ros
-ros_all['training_size']   = training_size_ros
-ros_all['trainingY_ratio'] = round(yc1_ratio_ros,2)
-ros_all['n_features']      = n_features
-###############################################################################
-#%% RUS: Random Under Sampling [Numerical + categorical]
-#================
-# Baseline
-# RUS
-#================
-rus_scores_mmD = MultModelsCl(input_df = X_rus
-                    , target = y_rus
-                    , var_type = 'mixed'
-                    , skf_cv = skf_cv
-                    , blind_test_df = X_bts
-                    , blind_test_target = y_bts
-                    , add_cm = True 
-                    , add_yn = True)
-
-rus_all_scores = pd.DataFrame(rus_scores_mmD)
-rs_rus = 'rus'
-#-----------------------
-#  WF: only CV and BTS
-#-----------------------
-rus_allT = rus_all_scores.T
-
-rus_CV = rus_allT.filter(regex='test_', axis=1); rus_CV.columns
-# map colnames for consistency to allow concatenting
-rus_CV.columns = rus_CV.columns.map(scoreCV_mapD); rus_CV.columns
-rus_CV['Data_source'] = 'CV'
-rus_CV['Resampling'] = rs_rus
-
-rus_BT = rus_allT.filter(regex='bts_' , axis=1); rus_BT.columns
-# map colnames for consistency to allow concatenting
-rus_BT.columns = rus_BT.columns.map(scoreBT_mapD); rus_BT.columns
-rus_BT['Data_source'] = 'BT'
-rus_BT['Resampling'] = rs_rus
-
-# # Write csv
-# rus_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
-# rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
-# rus_CV.to_csv(outdir_ml + gene.lower() + '_rus_CV_allF.csv')
-# rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
-
-#----------------------------------
-#  LF*: CV + BTS + Other info
-#----------------------------------
-# other data dependent variables
-training_size_rus = len(X_rus)
-n_features        = len(X_rus.columns)
-yc1_rus             = Counter(y_rus)
-yc1_ratio_rus       = yc1_rus[0]/yc1_rus[1]
-
-rus_all = rus_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
-rus_all = rus_all.reset_index()
-rus_all.rename(columns = {'index': 'original_names'}, inplace = True)
-
-# Indicate whether BT or CV
-bt_pattern = re.compile(r'bts_.*')
-rus_all['Data_source'] = rus_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
-
-rus_all['score_type'] = rus_all['original_names'].str.replace('bts_|test_', '', regex = True)
-
-score_type_uniqueN = set(rus_all['score_type'])
-cL1 = list(score_type_ordermapD.keys())
-cL2 = list(score_type_uniqueN)
-
-if set(cL1).issubset(cL2):
-    print('\nPASS: sorting df by score that is mapped onto the order I want')
-    rus_all['score_order'] = rus_all['score_type'].map(score_type_ordermapD)
-    rus_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
-else:
-    sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
-
-# add cols: specific
-rus_all['Resampling']      = rs_rus
-rus_all['training_size']   = training_size_rus
-rus_all['trainingY_ratio'] = round(yc1_ratio_rus,2)
-rus_all['n_features']      = n_features
-
-###############################################################################
-#%% ROS+RUS Combined: [Numerical + categorical]
-#================
-# Baseline
-# ROUC
-#================
-rouC_scores_mmD = MultModelsCl(input_df = X_rouC
-                    , target = y_rouC
-                    , var_type = 'mixed'
-                    , skf_cv = skf_cv
-                    , blind_test_df = X_bts
-                    , blind_test_target = y_bts
-                    , add_cm = True 
-                    , add_yn = True)
-
-rouC_all_scores = pd.DataFrame(rouC_scores_mmD)
-rs_rouC = 'rouC'
-#-----------------------
-#  WF: only CV and BTS
-#-----------------------
-rouC_allT = rouC_all_scores.T
-
-rouC_CV = rouC_allT.filter(regex='test_', axis=1); rouC_CV.columns
-# map colnames for consistency to allow concatenting
-rouC_CV.columns = rouC_CV.columns.map(scoreCV_mapD); rouC_CV.columns
-rouC_CV['Data_source'] = 'CV'
-rouC_CV['Resampling'] = rs_rouC
-
-rouC_BT = rouC_allT.filter(regex='bts_', axis=1); rouC_BT.columns
-# map colnames for consistency to allow concatenting
-rouC_BT.columns = rouC_BT.columns.map(scoreBT_mapD); rouC_BT.columns
-rouC_BT['Data_source'] = 'BT'
-rouC_BT['Resampling']  = rs_rouC
-
-# Write csv
-# rouC_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
-# rouC_CV.to_csv(outdir_ml + gene.lower() + '_rouC_CV_allF.csv')
-# rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
-# rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
-
-#----------------------------------
-#  LF*: CV + BTS + Other info
-#----------------------------------
-# other data dependent variables
-training_size_rouC = len(X_rouC)
-n_features         = len(X_rouC.columns)
-yc1_rouC           = Counter(y_rouC)
-yc1_ratio_rouC     = yc1_rouC[0]/yc1_rouC[1]
-
-rouC_all = rouC_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
-rouC_all = rouC_all.reset_index()
-rouC_all.rename(columns = {'index': 'original_names'}, inplace = True)
-
-# Indicate whether BT or CV
-bt_pattern = re.compile(r'bts_.*')
-rouC_all['Data_source'] = rouC_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
-
-rouC_all['score_type'] = rouC_all['original_names'].str.replace('bts_|test_', '', regex = True)
-
-score_type_uniqueN = set(rouC_all['score_type'])
-cL1 = list(score_type_ordermapD.keys())
-cL2 = list(score_type_uniqueN)
-
-if set(cL1).issubset(cL2):
-    print('\nPASS: sorting df by score that is mapped onto the order I want')
-    rouC_all['score_order'] = rouC_all['score_type'].map(score_type_ordermapD)
-    rouC_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
-else:
-    sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
-
-# add cols: specific
-rouC_all['Resampling']      = rs_rouC
-rouC_all['training_size']   = training_size_rouC
-rouC_all['trainingY_ratio'] = round(yc1_ratio_rouC,2)
-rouC_all['n_features']      = n_features
-
-###############################################################################
-#%% COMBINING all dfs: WF and LF
-# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
-#%% Combine WF
-#-----------------
-# Combine WF
-#-----------------
-dfs_combine_wf = [baseline_BT, smnc_BT, ros_BT, rus_BT, rouC_BT,
-                  baseline_CV, smnc_CV, ros_CV, rus_CV, rouC_CV]
-
-dfs_nrows_wf = []
-for df in dfs_combine_wf:
-    dfs_nrows_wf = dfs_nrows_wf + [len(df)]
-dfs_nrows_wf = max(dfs_nrows_wf)
+paramD = {
+        'baseline_paramD': { 'input_df'        : X
+                            , 'target'         : y
+                            , 'var_type'       : 'mixed'
+                            , 'resampling_type': 'none'}
+        
+        , 'smnc_paramD': { 'input_df'          : X_smnc
+                          , 'target'           : y_smnc
+                          , 'var_type'         : 'mixed'
+                          , 'resampling_type'  : 'smnc'}
    
-dfs_ncols_wf = []
-for df in dfs_combine_wf:
-    dfs_ncols_wf = dfs_ncols_wf + [len(df.columns)]
-dfs_ncols_wf = max(dfs_ncols_wf)
+        , 'ros_paramD': { 'input_df'           : X_ros
+                        , 'target'             : y_ros
+                        , 'var_type'           : 'mixed'
+                        , 'resampling_type'    : 'ros'}

-expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
-expected_ncols_wf = dfs_ncols_wf
+        , 'rus_paramD' : { 'input_df'          : X_rus
+                          , 'target'           : y_rus
+                          , 'var_type'         : 'mixed'
+                          , 'resampling_type'  : 'rus'}

-common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))
+        , 'rouC_paramD' : { 'input_df'         : X_rouC
+                            , 'target'          : y_rouC
+                            , 'var_type'        : 'mixed'
+                            , 'resampling_type' : 'rouC'}
+        }

-if len(common_cols_wf) == dfs_ncols_wf :
-    combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index=False)
-    resampling_methods_wf = combined_baseline_wf[['Resampling']]
-    resampling_methods_wf = resampling_methods_wf.drop_duplicates()
-    print('\nConcatenating dfs with different resampling methods [WF]:', tts_split
-          , '\nNo. of dfs combining:', len(dfs_combine_wf)
-          , '\nThe sampling methods are:'
-          , '\n', resampling_methods_wf)
-    if len(combined_baseline_wf) == expected_nrows_wf  and len(combined_baseline_wf.columns) == expected_ncols_wf:
-        print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
-              , '\nnrows in combined_df_wf:', len(combined_baseline_wf)
-              , '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
-    else:
-        print('\nFAIL: concatenating failed'
-              , '\nExpected nrows:', expected_nrows_wf
-              , '\nGot:', len(combined_baseline_wf)
-              , '\nExpected ncols:', expected_ncols_wf
-              , '\nGot:', len(combined_baseline_wf.columns))
-        sys.exit()
-else:
-    sys.exit('\nConcatenting dfs not possible [WF],check numbers ')
+# Initial run to get the dict containing CV, BT and metadata DFs 
+mmD = {}
+for k, v in paramD.items():
+#    print(mmD[k])
+    scores_7030D = MultModelsCl(**paramD[k]
+                        , tts_split_type = tts_split_7030
+                        , skf_cv = skf_cv
+                        , blind_test_df = X_bts
+                        , blind_test_target = y_bts
+                        , add_cm = True 
+                        , add_yn = True
+                        , return_formatted_output = True)
+    mmD[k] = scores_7030D
+
+# Extracting the dfs from within the dict and concatenating to output as one df
+for k, v in mmD.items():
+    out_wf_7030 = pd.concat(mmD, ignore_index = True)
    
-# Add index as a column
-combined_baseline_wf.columns
-combined_baseline_wf = combined_baseline_wf.reset_index()
-combined_baseline_wf.rename(columns = {'index': 'Model_name'}, inplace = True)
-combined_baseline_wf.head()
-
-# sort df: Resampling, Data_source, and MCC
-combined_baseline_wf.sort_values(by = ['Resampling', 'Data_source', 'MCC'], ascending = [True, True, False], inplace = True)
-##############################################################################
-#%% Combine LF
-#-----------------
-# Combine LF*
-#-----------------
-dfs_combine = [baseline_all, smnc_all, ros_all, rus_all, rouC_all ]
-              
-dfs_nrows = []
-for df in dfs_combine:
-    dfs_nrows = dfs_nrows + [len(df)]
-dfs_nrows = max(dfs_nrows)
-    
-dfs_ncols = []
-for df in dfs_combine:
-    dfs_ncols = dfs_ncols + [len(df.columns)]
-dfs_ncols = max(dfs_ncols)
-           
-# dfs_ncols = []
-# dfs_ncols2 = mode(dfs_ncols.append(len(df.columns) for df in dfs_combine)
-# dfs_ncols2
-
-expected_nrows = len(dfs_combine) * dfs_nrows
-expected_ncols = dfs_ncols
-
-common_cols = list(set.intersection(*(set(df.columns) for df in dfs_combine)))
-
-if len(common_cols) == dfs_ncols :
-    combined_baseline = pd.concat([df[common_cols] for df in dfs_combine], ignore_index=True)
-    resampling_methods = combined_baseline[['Resampling', 'training_size']]
-    resampling_methods = resampling_methods.drop_duplicates()
-    print('\nConcatenating dfs with different resampling methods:', tts_split
-          , '\nNo. of dfs combining:', len(dfs_combine)
-          , '\nThe sampling methods are:'
-          , '\n', resampling_methods)
-    if len(combined_baseline) == expected_nrows  and len(combined_baseline.columns) == expected_ncols:
-        print('\nPASS:', len(dfs_combine), 'dfs successfully combined'
-              , '\nnrows in combined_df:', len(combined_baseline)
-              , '\nncols in combined_df:', len(combined_baseline.columns))
-    else:
-        print('\nFAIL: concatenating failed'
-              , '\nExpected nrows:', expected_nrows
-              , '\nGot:', len(combined_baseline)
-              , '\nExpected ncols:', expected_ncols
-              , '\nGot:', len(combined_baseline.columns))
-        sys.exit()
-else:
-    sys.exit('\nConcatenting dfs not possible,check numbers ')
-    
-# Add further column indications
-combined_baseline['test_size']   = bts_size
-combined_baseline['tts_split']   = tts_split
-combined_baseline['testY_ratio'] = round(yc2_ratio,2)
-#combined_baseline.columns
-
-# change to column names to be lower case for consistency
-combined_baseline.rename(columns = {'Resampling'   : 'resampling'
-                                    , 'Data_source': 'data_source'}, inplace = True)
-combined_baseline.columns
-
-# sort df: resampling, data_source, mcc
-combined_baseline.sort_values(by = ['resampling', 'data_source', 'score_order'], ascending = [True, True, True], inplace = True)
-
-# # rpow bind 
-# if all(XXX):
-#     print('\nPASS:colnames match, proceeding to rowbind')
-#     comb_df = pd.concat()], axis = 0, ignore_index = True ) combined_baseline
+print('\n######################################################################'
+      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
+      , '\nGene:', gene.lower()
+      , '\nDrug:', drug
+      , '\noutput file:', outFile_wf
+      , '\nDim of output:', out_wf_7030.shape
+      , '\n######################################################################')
 ###############################################################################
 #====================
 # Write output file
 #====================
-combined_baseline_wf.to_csv(outFile_wf, index = False)
+#out_wf_7030.to_csv(outFile_wf, index = False)
 print('\nFile successfully written:', outFile_wf)
-
-combined_baseline.to_csv(outFile_lf, index = False)
-print('\nFile successfully written:', outFile_lf)
 ###############################################################################