Optimised run_7030.py to generate output from a dict, now that the process function and parameter dicts have been added

2022-06-24 15:40:18 +01:00 · 2022-06-24 15:40:18 +01:00 · b37a950fec
commit b37a950fec
parent 7dc7e25016
12 changed files with 180 additions and 128408 deletions
--- a/scripts/ml/MultClfs.py
+++ b/scripts/ml/MultClfs.py
@@ -197,35 +197,35 @@ def MultModelsCl(input_df, target, skf_cv
    # Specify multiple Classification Models  
    #======================================================
    models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
-            #  , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
+               , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
-            #   , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
+               , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
-            #   , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
+               , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
-            #   , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
+               , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
-            #   , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
+               , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
-            #   , ('Gaussian NB'               , GaussianNB() )
+               , ('Gaussian NB'               , GaussianNB() )
-            #   , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
+               , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
-            #   , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
+               , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
-            #   , ('LDA'                       , LinearDiscriminantAnalysis() )
+               , ('LDA'                       , LinearDiscriminantAnalysis() )
               , ('Logistic Regression'       , LogisticRegression(**rs) )
-            #   , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
+               , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
-            #   , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
+               , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
-            #   , ('Multinomial'               , MultinomialNB() )
+               , ('Multinomial'               , MultinomialNB() )
-            #   , ('Naive Bayes'               , BernoulliNB() )
+               , ('Naive Bayes'               , BernoulliNB() )
-            #   , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
+               , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
-            #   , ('QDA'                       , QuadraticDiscriminantAnalysis() )
+               , ('QDA'                       , QuadraticDiscriminantAnalysis() )
-            #   , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000 ) ) 
+               , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000 ) ) 
-            #    , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
+               , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
-            #                                                          , n_estimators     = 1000
+                                                                       , n_estimators     = 1000
-            #                                                          , bootstrap        = True
+                                                                       , bootstrap        = True
-            #                                                          , oob_score        = True
+                                                                       , oob_score        = True
-            #                                                          , **njobs
+                                                                       , **njobs
-            #                                                          , **rs
+                                                                       , **rs
-            #                                                          , max_features     = 'auto') ) 
+                                                                       , max_features     = 'auto') ) 
-            #   , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
+                , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
-            #   , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
+                , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
-            #   , ('SVC'                       , SVC(**rs) ) 
+                , ('SVC'                       , SVC(**rs) ) 
-            #   , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
+                , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-            #   , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
+                , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
             ]
    mm_skf_scoresD = {}
@@ -440,10 +440,11 @@ def ProcessMultModelsCl(inputD = {}):
          , '\nCV df:', len(scoresDF_CV.columns)
          , '\nBT_df:', len(scoresDF_BT.columns)
          , '\nmetaDF:', len(metaDF.columns))
    if  len(scoresDF_CV.columns) == len(scoresDF_BT.columns):
        print('\nFirst proceeding to rowbind CV and BT dfs:')
        expected_ncols_out = len(scoresDF_BT.columns) + len(metaDF.columns)
-        print('\nFinal output should have:',expected_ncols_out, 'columns' )
+        print('\nFinal output should have:', expected_ncols_out, 'columns' )
    #-----------------
    # Combine WF
@@ -496,8 +497,7 @@ def ProcessMultModelsCl(inputD = {}):
            sys.exit('\nFIRST IF FAILS')
    else:
        print('\nConcatenting dfs not possible [WF],check numbers ')    
-    
+
    #-------------------------------------
    # Combine WF+Metadata: Final output
    #-------------------------------------
@@ -515,11 +515,15 @@ def ProcessMultModelsCl(inputD = {}):
        print('\nPASS: Combined df has expected ncols')
    else:
        sys.exit('\nFAIL: Length mismatch for combined_df')
    print('\nAdding column: Model_name')
    combDF['Model_name'] = combDF.index
    print('\n========================================================='
          , '\nSUCCESS: Ran multiple classifiers'
          , '\n=======================================================')
-        
+
    #resampling_methods_wf = combined_baseline_wf[['resampling']]
    #resampling_methods_wf = resampling_methods_wf.drop_duplicates()
              #, '\n', resampling_methods_wf)
--- a/scripts/ml/log_alr_7030.txt
+++ b/scripts/ml/log_alr_7030.txt
@@ -1,72 +0,0 @@
 /home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py:464: SettingWithCopyWarning: 
 A value is trying to be set on a copy of a slice from a DataFrame
 See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
 1.22.4
 1.4.1
 aaindex_df contains non-numerical data
 Total no. of non-numerial columns: 2
 Selecting numerical data only
 PASS: successfully selected numerical columns only for aaindex_df
 Now checking for NA in the remaining aaindex_cols
 Counting aaindex_df cols with NA 
 ncols with NA: 4 columns 
 Dropping these... 
 Original ncols: 127
 Revised df ncols: 123
 Checking NA in revised df...
 PASS: cols with NA successfully dropped from aaindex_df 
 Proceeding with combining aa_df with other features_df
 PASS: ncols match 
 Expected ncols: 123 
 Got: 123
 Total no. of columns in clean aa_df: 123
 Proceeding to merge, expected nrows in merged_df: 271
 PASS: my_features_df and aa_df successfully combined 
 nrows: 271 
 ncols: 269
 count of NULL values before imputation
 or_mychisq          256
 log10_or_mychisq    256
 dtype: int64
 count of NULL values AFTER imputation
 mutationinformation    0
 or_rawI                0
 logorI                 0
 dtype: int64
 PASS: OR values imputed, data ready for ML
 Total no. of features for aaindex: 123
 PASS: x_features has no target variable
 No. of columns for x_features: 174
 Traceback (most recent call last):
  File "/home/tanu/git/LSHTM_analysis/scripts/ml/./run_7030.py", line 35, in <module>
    setvars(gene,drug)
  File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py", line 636, in setvars
    X, X_bts, y, y_bts = train_test_split(x_features, y_target
  File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 2454, in train_test_split
    train, test = next(cv.split(X=arrays[0], y=stratify))
  File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1613, in split
    for train, test in self._iter_indices(X, y, groups):
  File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1953, in _iter_indices
    raise ValueError(
 ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
--- a/scripts/ml/log_embb_7030.txt
+++ b/scripts/ml/log_embb_7030.txt
--- a/scripts/ml/log_gid_7030.txt
+++ b/scripts/ml/log_gid_7030.txt
--- a/scripts/ml/log_katg_7030.txt
+++ b/scripts/ml/log_katg_7030.txt
--- a/scripts/ml/log_pnca_7030.txt
+++ b/scripts/ml/log_pnca_7030.txt
--- a/scripts/ml/log_rpob_7030.txt
+++ b/scripts/ml/log_rpob_7030.txt
--- a/scripts/ml/ml_data_7030.py
+++ b/scripts/ml/ml_data_7030.py
@@ -557,7 +557,7 @@ def setvars(gene,drug):
    # FG5: Genomic features
    #========================
    X_gn_mafor_Fnum =  ['maf'
-                    , 'logorI'
+                    #, 'logorI'
                    # , 'or_rawI'
                    # , 'or_mychisq'
                    # , 'or_logistic'
--- a/scripts/ml/run_7030.py
+++ b/scripts/ml/run_7030.py
@@ -5,28 +5,28 @@ Created on Mon Jun 20 13:05:23 2022
@author: tanu
 """
 #%%Imports ####################################################################
 import re
 import argparse
-###############################################################################
+import os, sys
 # gene  = 'pncA'
 # drug  = 'pyrazinamide'
 #total_mtblineage_uc = 8
-
+###############################################################################
 #%% command line args: case sensitive
-arg_parser = argparse.ArgumentParser()
+# arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
+# arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
-arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
+# arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
-args = arg_parser.parse_args()
+# args = arg_parser.parse_args()
-drug    = args.drug
+# drug    = args.drug
-gene    = args.gene
+# gene    = args.gene
 ###############################################################################
-#==================
+homedir = os.path.expanduser("~")
-# other vars
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
-#==================
+
 tts_split    = '70/30'
 OutFile_suffix  = '7030'
 ###############################################################################
 #==================
 # Import data
@@ -39,10 +39,15 @@ from ml_data_7030 import *
 #from UQ_yc_RunAllClfs import run_all_ML
 #====================
-# Import ML function 
+# Import ML functions 
 #====================
-# TT run all ML clfs: baseline model
+from MultClfs import *
-from MultModelsCl import MultModelsCl
+
 #==================
 # other vars
 #==================
 tts_split_7030    = '70_30'
 OutFile_suffix  = '7030'
 #==================
 # Specify outdir 
@@ -51,563 +56,71 @@ outdir_ml = outdir + 'ml/tts_7030/'
 print('\nOutput directory:', outdir_ml)
 outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
-outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
+#outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
-###############################################################################
+#%% Running models ############################################################
 score_type_ordermapD = { 'mcc'      : 1
                   , 'fscore'       : 2
                   , 'jcc'          : 3
                   , 'precision'    : 4
                   , 'recall'       : 5      
                   , 'accuracy'     : 6  
                   , 'roc_auc'      : 7
                   , 'TN'           : 8
                   , 'FP'           : 9
                   , 'FN'           : 10
                   , 'TP'           : 11  
                   , 'trainingY_neg': 12  
                   , 'trainingY_pos': 13    
                   , 'blindY_neg'   : 14
                   , 'blindY_pos'   : 15
                   , 'fit_time'     : 16
                   , 'score_time'   : 17
                   }
 scoreCV_mapD = {'test_mcc'         : 'MCC'
                , 'test_fscore'    : 'F1'
                , 'test_precision' : 'Precision'
                , 'test_recall'    : 'Recall'
                , 'test_accuracy'  : 'Accuracy'
                , 'test_roc_auc'   : 'ROC_AUC'
                , 'test_jcc'       : 'JCC'
                }
 scoreBT_mapD = {'bts_mcc'          : 'MCC'
                , 'bts_fscore'     : 'F1'
                , 'bts_precision'  : 'Precision'
                , 'bts_recall'     : 'Recall'
                , 'bts_accuracy'   : 'Accuracy'
                , 'bts_roc_auc'    : 'ROC_AUC'
                , 'bts_jcc'        : 'JCC'
               }
 # data dependent variables but NOT dependent on resampling
 bts_size  = len(X_bts)
 yc2       = Counter(y_bts)
 yc2_ratio = yc2[0]/yc2[1]
 ###############################################################################
 print('\n#####################################################################\n'
-      , '\nRunning ML analysis: feature groups '
+      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
      , '\nGene name:', gene
-      , '\nDrug name:', drug)
+      , '\nDrug name:', drug
      , '\n#####################################################################\n')
-#%% Basic: No Oversampling
+paramD = {
-#================
+        'baseline_paramD': { 'input_df'        : X
-# Baseline
+                            , 'target'         : y
-# No resampling
+                            , 'var_type'       : 'mixed'
-#================  
+                            , 'resampling_type': 'none'}
-scores_mmD = MultModelsCl(input_df = X
+        
-                    , target = y
+        , 'smnc_paramD': { 'input_df'          : X_smnc
-                    , var_type = 'mixed'
+                          , 'target'           : y_smnc
-                    , skf_cv = skf_cv
+                          , 'var_type'         : 'mixed'
-                    , blind_test_df = X_bts
+                          , 'resampling_type'  : 'smnc'}
                    , blind_test_target = y_bts
                    , add_cm = True 
                    , add_yn = True)
 baseline_all_scores = pd.DataFrame(scores_mmD)
 rs_none = 'none'
 #------------------------
 #  WF: only CV and BTS
 #-----------------------
 baseline_allT = baseline_all_scores.T
 #baseline_train = baseline_all.filter(regex='train_', axis=1)
 baseline_CV = baseline_allT.filter(regex='test_', axis = 1); baseline_CV.columns
 # map colnames for consistency to allow concatenting
 baseline_CV.columns = baseline_CV.columns.map(scoreCV_mapD); baseline_CV.columns
 baseline_CV['Data_source'] = 'CV'
 baseline_CV['Resampling']  = rs_none
 baseline_BT = baseline_allT.filter(regex='bts_', axis = 1); baseline_BT.columns
 # map colnames for consistency to allow concatenting
 baseline_BT.columns = baseline_BT.columns.map(scoreBT_mapD); baseline_BT.columns
 baseline_BT['Data_source'] = 'BT'
 baseline_BT['Resampling'] = rs_none
 # # Write csv
 #baseline_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
 #baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
 # baseline_CV.to_csv(outdir_ml + gene.lower() + '_baseline_CV_allF.csv')
 # baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
 #----------------------------------
 #  LF*: CV + BTS + Other info
 #-----------------------------------
 # other data dependent variables
 training_size_ns = len(X)
 n_features       = len(X.columns)
 yc1              = Counter(y)
 yc1_ratio        = yc1[0]/yc1[1]
 baseline_all = baseline_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
 baseline_all = baseline_all.reset_index()
 baseline_all.rename(columns = {'index': 'original_names'}, inplace = True)
 # Indicate whether BT or CV
 bt_pattern = re.compile(r'bts_.*')
 baseline_all['Data_source'] = baseline_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
 baseline_all['score_type'] = baseline_all['original_names'].str.replace('bts_|test_', '', regex = True)
 score_type_uniqueN = set(baseline_all['score_type'])
 cL1 = list(score_type_ordermapD.keys())
 cL2 = list(score_type_uniqueN)
 if set(cL1).issubset(cL2):
    print('\nPASS: sorting df by score that is mapped onto the order I want')
    baseline_all['score_order'] = baseline_all['score_type'].map(score_type_ordermapD)
    baseline_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
 else:
    sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
 # add cols: specific
 baseline_all['Resampling']     = rs_none
 baseline_all['training_size']  = training_size_ns
 baseline_all['trainingY_ratio']= round(yc1_ratio,2)
 baseline_all['n_features']     = n_features
 ###############################################################################
 #%% SMOTE NC: Smote Oversampling [Numerical + categorical]
 #================
 # Baseline
 # SMOTE NC: SMNC
 #================
 smnc_scores_mmD = MultModelsCl(input_df = X_smnc
                    , target = y_smnc
                    , var_type = 'mixed'
                    , skf_cv = skf_cv
                    , blind_test_df = X_bts
                    , blind_test_target = y_bts
                    , add_cm = True 
                    , add_yn = True)
 smnc_all_scores = pd.DataFrame(smnc_scores_mmD)
 rs_smnc = 'smnc'
 #------------------------
 #  WF: only CV and BTS
 #-----------------------
 smnc_allT = smnc_all_scores.T
 smnc_CV = smnc_allT.filter(regex='test_', axis = 1); smnc_CV.columns
 # map colnames for consistency to allow concatenting
 smnc_CV.columns = smnc_CV.columns.map(scoreCV_mapD); smnc_CV.columns
 smnc_CV['Data_source'] = 'CV'
 smnc_CV['Resampling']  = rs_smnc
 smnc_BT = smnc_allT.filter(regex='bts_', axis = 1); smnc_BT.columns
 # map colnames for consistency to allow concatenting
 smnc_BT.columns = smnc_BT.columns.map(scoreBT_mapD); smnc_BT.columns
 smnc_BT['Data_source'] = 'BT'
 smnc_BT['Resampling']  = rs_smnc
 # Write csv
 # smnc_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
 # smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
 # smnc_CV.to_csv(outdir_ml + gene.lower() + '_smnc_CV_allF.csv')
 # smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
 #----------------------------------
 #  LF*: CV + BTS + Other info
 #-----------------------------------
 # other data dependent variables
 training_size_smnc = len(X_smnc)
 n_features         = len(X_smnc.columns)
 yc1_smnc              = Counter(y_smnc)
 yc1_ratio_smnc        = yc1_smnc[0]/yc1_smnc[1]
 smnc_all = smnc_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
 smnc_all = smnc_all.reset_index()
 smnc_all.rename(columns = {'index': 'original_names'}, inplace = True)
 # Indicate whether BT or CV
 bt_pattern = re.compile(r'bts_.*')
 smnc_all['Data_source'] = smnc_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
 smnc_all['score_type'] = smnc_all['original_names'].str.replace('bts_|test_', '', regex = True)
 score_type_uniqueN = set(smnc_all['score_type'])
 cL1 = list(score_type_ordermapD.keys())
 cL2 = list(score_type_uniqueN)
 if set(cL1).issubset(cL2):
    print('\nPASS: sorting df by score that is mapped onto the order I want')
    smnc_all['score_order'] = smnc_all['score_type'].map(score_type_ordermapD)
    smnc_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
 else:
    sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
 # add cols: specific
 smnc_all['Resampling']      = rs_smnc
 smnc_all['training_size']   = training_size_smnc
 smnc_all['trainingY_ratio'] = round(yc1_ratio_smnc,2)
 smnc_all['n_features']      = n_features
 ###############################################################################
 #%% ROS: Random Over Sampling [Numerical + categorical]
 #================
 # Baseline
 # ROS
 #================
 ros_scores_mmD = MultModelsCl(input_df = X_ros
                    , target = y_ros
                    , var_type = 'mixed'
                    , skf_cv = skf_cv
                    , blind_test_df = X_bts
                    , blind_test_target = y_bts
                    , add_cm = True 
                    , add_yn = True)
 ros_all_scores = pd.DataFrame(ros_scores_mmD)
 rs_ros = 'ros'
 #------------------------
 #  WF: only CV and BTS
 #-----------------------
 ros_allT = ros_all_scores.T
 ros_CV  = ros_allT.filter(regex='test_', axis = 1); ros_CV.columns
 # map colnames for consistency to allow concatenting
 ros_CV.columns = ros_CV.columns.map(scoreCV_mapD); ros_CV.columns
 ros_CV['Data_source'] = 'CV'
 ros_CV['Resampling'] = rs_ros
 ros_BT  = ros_allT.filter(regex='bts_', axis = 1); ros_BT.columns
 # map colnames for consistency to allow concatenting
 ros_BT.columns = ros_BT.columns.map(scoreBT_mapD); ros_BT.columns
 ros_BT['Data_source'] = 'BT'
 ros_BT['Resampling'] = rs_ros
 # Write csv
 # ros_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
 # ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
 # ros_CV.to_csv(outdir_ml + gene.lower() + '_ros_CV_allF.csv')
 # ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
 #----------------------------------
 #  LF*: CV + BTS + Other info
 #----------------------------------
 # other data dependent variables
 training_size_ros = len(X_ros)
 n_features        = len(X_ros.columns)
 yc1_ros             = Counter(y_ros)
 yc1_ratio_ros       = yc1_ros[0]/yc1_ros[1]
 ros_all = ros_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
 ros_all = ros_all.reset_index()
 ros_all.rename(columns = {'index': 'original_names'}, inplace = True)
 # Indicate whether BT or CV
 bt_pattern = re.compile(r'bts_.*')
 ros_all['Data_source'] = ros_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
 ros_all['score_type'] = ros_all['original_names'].str.replace('bts_|test_', '', regex = True)
 score_type_uniqueN = set(ros_all['score_type'])
 cL1 = list(score_type_ordermapD.keys())
 cL2 = list(score_type_uniqueN)
 if set(cL1).issubset(cL2):
    print('\nPASS: sorting df by score that is mapped onto the order I want')
    ros_all['score_order'] = ros_all['score_type'].map(score_type_ordermapD)
    ros_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
 else:
    sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
 # add cols: specific
 ros_all['Resampling']      = rs_ros
 ros_all['training_size']   = training_size_ros
 ros_all['trainingY_ratio'] = round(yc1_ratio_ros,2)
 ros_all['n_features']      = n_features
 ###############################################################################
 #%% RUS: Random Under Sampling [Numerical + categorical]
 #================
 # Baseline
 # RUS
 #================
 rus_scores_mmD = MultModelsCl(input_df = X_rus
                    , target = y_rus
                    , var_type = 'mixed'
                    , skf_cv = skf_cv
                    , blind_test_df = X_bts
                    , blind_test_target = y_bts
                    , add_cm = True 
                    , add_yn = True)
 rus_all_scores = pd.DataFrame(rus_scores_mmD)
 rs_rus = 'rus'
 #-----------------------
 #  WF: only CV and BTS
 #-----------------------
 rus_allT = rus_all_scores.T
 rus_CV = rus_allT.filter(regex='test_', axis=1); rus_CV.columns
 # map colnames for consistency to allow concatenting
 rus_CV.columns = rus_CV.columns.map(scoreCV_mapD); rus_CV.columns
 rus_CV['Data_source'] = 'CV'
 rus_CV['Resampling'] = rs_rus
 rus_BT = rus_allT.filter(regex='bts_' , axis=1); rus_BT.columns
 # map colnames for consistency to allow concatenting
 rus_BT.columns = rus_BT.columns.map(scoreBT_mapD); rus_BT.columns
 rus_BT['Data_source'] = 'BT'
 rus_BT['Resampling'] = rs_rus
 # # Write csv
 # rus_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
 # rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
 # rus_CV.to_csv(outdir_ml + gene.lower() + '_rus_CV_allF.csv')
 # rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
 #----------------------------------
 #  LF*: CV + BTS + Other info
 #----------------------------------
 # other data dependent variables
 training_size_rus = len(X_rus)
 n_features        = len(X_rus.columns)
 yc1_rus             = Counter(y_rus)
 yc1_ratio_rus       = yc1_rus[0]/yc1_rus[1]
 rus_all = rus_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
 rus_all = rus_all.reset_index()
 rus_all.rename(columns = {'index': 'original_names'}, inplace = True)
 # Indicate whether BT or CV
 bt_pattern = re.compile(r'bts_.*')
 rus_all['Data_source'] = rus_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
 rus_all['score_type'] = rus_all['original_names'].str.replace('bts_|test_', '', regex = True)
 score_type_uniqueN = set(rus_all['score_type'])
 cL1 = list(score_type_ordermapD.keys())
 cL2 = list(score_type_uniqueN)
 if set(cL1).issubset(cL2):
    print('\nPASS: sorting df by score that is mapped onto the order I want')
    rus_all['score_order'] = rus_all['score_type'].map(score_type_ordermapD)
    rus_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
 else:
    sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
 # add cols: specific
 rus_all['Resampling']      = rs_rus
 rus_all['training_size']   = training_size_rus
 rus_all['trainingY_ratio'] = round(yc1_ratio_rus,2)
 rus_all['n_features']      = n_features
 ###############################################################################
 #%% ROS+RUS Combined: [Numerical + categorical]
 #================
 # Baseline
 # ROUC
 #================
 rouC_scores_mmD = MultModelsCl(input_df = X_rouC
                    , target = y_rouC
                    , var_type = 'mixed'
                    , skf_cv = skf_cv
                    , blind_test_df = X_bts
                    , blind_test_target = y_bts
                    , add_cm = True 
                    , add_yn = True)
 rouC_all_scores = pd.DataFrame(rouC_scores_mmD)
 rs_rouC = 'rouC'
 #-----------------------
 #  WF: only CV and BTS
 #-----------------------
 rouC_allT = rouC_all_scores.T
 rouC_CV = rouC_allT.filter(regex='test_', axis=1); rouC_CV.columns
 # map colnames for consistency to allow concatenting
 rouC_CV.columns = rouC_CV.columns.map(scoreCV_mapD); rouC_CV.columns
 rouC_CV['Data_source'] = 'CV'
 rouC_CV['Resampling'] = rs_rouC
 rouC_BT = rouC_allT.filter(regex='bts_', axis=1); rouC_BT.columns
 # map colnames for consistency to allow concatenting
 rouC_BT.columns = rouC_BT.columns.map(scoreBT_mapD); rouC_BT.columns
 rouC_BT['Data_source'] = 'BT'
 rouC_BT['Resampling']  = rs_rouC
 # Write csv
 # rouC_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
 # rouC_CV.to_csv(outdir_ml + gene.lower() + '_rouC_CV_allF.csv')
 # rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
 # rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
 #----------------------------------
 #  LF*: CV + BTS + Other info
 #----------------------------------
 # other data dependent variables
 training_size_rouC = len(X_rouC)
 n_features         = len(X_rouC.columns)
 yc1_rouC           = Counter(y_rouC)
 yc1_ratio_rouC     = yc1_rouC[0]/yc1_rouC[1]
 rouC_all = rouC_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
 rouC_all = rouC_all.reset_index()
 rouC_all.rename(columns = {'index': 'original_names'}, inplace = True)
 # Indicate whether BT or CV
 bt_pattern = re.compile(r'bts_.*')
 rouC_all['Data_source'] = rouC_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
 rouC_all['score_type'] = rouC_all['original_names'].str.replace('bts_|test_', '', regex = True)
 score_type_uniqueN = set(rouC_all['score_type'])
 cL1 = list(score_type_ordermapD.keys())
 cL2 = list(score_type_uniqueN)
 if set(cL1).issubset(cL2):
    print('\nPASS: sorting df by score that is mapped onto the order I want')
    rouC_all['score_order'] = rouC_all['score_type'].map(score_type_ordermapD)
    rouC_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
 else:
    sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
 # add cols: specific
 rouC_all['Resampling']      = rs_rouC
 rouC_all['training_size']   = training_size_rouC
 rouC_all['trainingY_ratio'] = round(yc1_ratio_rouC,2)
 rouC_all['n_features']      = n_features
 ###############################################################################
 #%% COMBINING all dfs: WF and LF
 # https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
 #%% Combine WF
 #-----------------
 # Combine WF
 #-----------------
 dfs_combine_wf = [baseline_BT, smnc_BT, ros_BT, rus_BT, rouC_BT,
                  baseline_CV, smnc_CV, ros_CV, rus_CV, rouC_CV]
 dfs_nrows_wf = []
 for df in dfs_combine_wf:
    dfs_nrows_wf = dfs_nrows_wf + [len(df)]
 dfs_nrows_wf = max(dfs_nrows_wf)
-dfs_ncols_wf = []
+        , 'ros_paramD': { 'input_df'           : X_ros
-for df in dfs_combine_wf:
+                        , 'target'             : y_ros
-    dfs_ncols_wf = dfs_ncols_wf + [len(df.columns)]
+                        , 'var_type'           : 'mixed'
-dfs_ncols_wf = max(dfs_ncols_wf)
+                        , 'resampling_type'    : 'ros'}
-expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
+        , 'rus_paramD' : { 'input_df'          : X_rus
-expected_ncols_wf = dfs_ncols_wf
+                          , 'target'           : y_rus
                          , 'var_type'         : 'mixed'
                          , 'resampling_type'  : 'rus'}
-common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))
+        , 'rouC_paramD' : { 'input_df'         : X_rouC
                            , 'target'          : y_rouC
                            , 'var_type'        : 'mixed'
                            , 'resampling_type' : 'rouC'}
        }
-if len(common_cols_wf) == dfs_ncols_wf :
+# Initial run to get the dict containing CV, BT and metadata DFs 
-    combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index=False)
+mmD = {}
-    resampling_methods_wf = combined_baseline_wf[['Resampling']]
+for k, v in paramD.items():
-    resampling_methods_wf = resampling_methods_wf.drop_duplicates()
+#    print(mmD[k])
-    print('\nConcatenating dfs with different resampling methods [WF]:', tts_split
+    scores_7030D = MultModelsCl(**paramD[k]
-          , '\nNo. of dfs combining:', len(dfs_combine_wf)
+                        , tts_split_type = tts_split_7030
-          , '\nThe sampling methods are:'
+                        , skf_cv = skf_cv
-          , '\n', resampling_methods_wf)
+                        , blind_test_df = X_bts
-    if len(combined_baseline_wf) == expected_nrows_wf  and len(combined_baseline_wf.columns) == expected_ncols_wf:
+                        , blind_test_target = y_bts
-        print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
+                        , add_cm = True 
-              , '\nnrows in combined_df_wf:', len(combined_baseline_wf)
+                        , add_yn = True
-              , '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
+                        , return_formatted_output = True)
-    else:
+    mmD[k] = scores_7030D
-        print('\nFAIL: concatenating failed'
+
-              , '\nExpected nrows:', expected_nrows_wf
+# Extracting the dfs from within the dict and concatenating to output as one df
-              , '\nGot:', len(combined_baseline_wf)
+for k, v in mmD.items():
-              , '\nExpected ncols:', expected_ncols_wf
+    out_wf_7030 = pd.concat(mmD, ignore_index = True)
              , '\nGot:', len(combined_baseline_wf.columns))
        sys.exit()
 else:
    sys.exit('\nConcatenting dfs not possible [WF],check numbers ')
-# Add index as a column
+print('\n######################################################################'
-combined_baseline_wf.columns
+      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
-combined_baseline_wf = combined_baseline_wf.reset_index()
+      , '\nGene:', gene.lower()
-combined_baseline_wf.rename(columns = {'index': 'Model_name'}, inplace = True)
+      , '\nDrug:', drug
-combined_baseline_wf.head()
+      , '\noutput file:', outFile_wf
-
+      , '\nDim of output:', out_wf_7030.shape
-# sort df: Resampling, Data_source, and MCC
+      , '\n######################################################################')
 combined_baseline_wf.sort_values(by = ['Resampling', 'Data_source', 'MCC'], ascending = [True, True, False], inplace = True)
 ##############################################################################
 #%% Combine LF
 #-----------------
 # Combine LF*
 #-----------------
 dfs_combine = [baseline_all, smnc_all, ros_all, rus_all, rouC_all ]
 dfs_nrows = []
 for df in dfs_combine:
    dfs_nrows = dfs_nrows + [len(df)]
 dfs_nrows = max(dfs_nrows)
 dfs_ncols = []
 for df in dfs_combine:
    dfs_ncols = dfs_ncols + [len(df.columns)]
 dfs_ncols = max(dfs_ncols)
 # dfs_ncols = []
 # dfs_ncols2 = mode(dfs_ncols.append(len(df.columns) for df in dfs_combine)
 # dfs_ncols2
 expected_nrows = len(dfs_combine) * dfs_nrows
 expected_ncols = dfs_ncols
 common_cols = list(set.intersection(*(set(df.columns) for df in dfs_combine)))
 if len(common_cols) == dfs_ncols :
    combined_baseline = pd.concat([df[common_cols] for df in dfs_combine], ignore_index=True)
    resampling_methods = combined_baseline[['Resampling', 'training_size']]
    resampling_methods = resampling_methods.drop_duplicates()
    print('\nConcatenating dfs with different resampling methods:', tts_split
          , '\nNo. of dfs combining:', len(dfs_combine)
          , '\nThe sampling methods are:'
          , '\n', resampling_methods)
    if len(combined_baseline) == expected_nrows  and len(combined_baseline.columns) == expected_ncols:
        print('\nPASS:', len(dfs_combine), 'dfs successfully combined'
              , '\nnrows in combined_df:', len(combined_baseline)
              , '\nncols in combined_df:', len(combined_baseline.columns))
    else:
        print('\nFAIL: concatenating failed'
              , '\nExpected nrows:', expected_nrows
              , '\nGot:', len(combined_baseline)
              , '\nExpected ncols:', expected_ncols
              , '\nGot:', len(combined_baseline.columns))
        sys.exit()
 else:
    sys.exit('\nConcatenting dfs not possible,check numbers ')
 # Add further column indications
 combined_baseline['test_size']   = bts_size
 combined_baseline['tts_split']   = tts_split
 combined_baseline['testY_ratio'] = round(yc2_ratio,2)
 #combined_baseline.columns
 # change to column names to be lower case for consistency
 combined_baseline.rename(columns = {'Resampling'   : 'resampling'
                                    , 'Data_source': 'data_source'}, inplace = True)
 combined_baseline.columns
 # sort df: resampling, data_source, score_order
 combined_baseline.sort_values(by = ['resampling', 'data_source', 'score_order'], ascending = [True, True, True], inplace = True)
 # # row bind
 # if all(XXX):
 #     print('\nPASS:colnames match, proceeding to rowbind')
 #     comb_df = pd.concat()], axis = 0, ignore_index = True ) combined_baseline
 ###############################################################################
 #====================
 # Write output file
 #====================
-combined_baseline_wf.to_csv(outFile_wf, index = False)
+#out_wf_7030.to_csv(outFile_wf, index = False)
 print('\nFile successfully written:', outFile_wf)
 combined_baseline.to_csv(outFile_lf, index = False)
 print('\nFile successfully written:', outFile_lf)
 ###############################################################################
--- a/scripts/ml/run_7030_LOOP.py
+++ b/scripts/ml/run_7030_LOOP.py
@ -9,6 +9,8 @@ Created on Mon Jun 20 13:05:23 2022
 import re
 import argparse
 import os, sys
 import collections
 # gene  = 'pncA'
 # drug  = 'pyrazinamide'
 #total_mtblineage_uc = 8
@ -25,6 +27,7 @@ import os, sys
 ###############################################################################
 homedir = os.path.expanduser("~")
 sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
 ###############################################################################
 #==================
 # Import data
@ -54,79 +57,70 @@ outdir_ml = outdir + 'ml/tts_7030/'
 print('\nOutput directory:', outdir_ml)
 outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
-outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
+#outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
 #%% Running models ############################################################
 print('\n#####################################################################\n'
-      , '\nRunning ML analysis: feature groups '
+      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
      , '\nGene name:', gene
-      , '\nDrug name:', drug)
+      , '\nDrug name:', drug
      , '\n#####################################################################\n')
-fooD = {'baseline_paramD': {
+paramD = {
-                   'input_df': X
+        'baseline_paramD': { 'input_df'        : X
-                   , 'target': y
+                            , 'target'         : y
-                   , 'var_type': 'mixed'
+                            , 'var_type'       : 'mixed'
-                   , 'resampling_type': 'none'}
+                            , 'resampling_type': 'none'}
-        ,
+        
-        'smnc_paramD': {'input_df': X_smnc
+        , 'smnc_paramD': { 'input_df'          : X_smnc
-                   , 'target': y_smnc
+                          , 'target'           : y_smnc
-                   , 'var_type': 'mixed'
+                          , 'var_type'         : 'mixed'
-                   , 'resampling_type': 'smnc'}
+                          , 'resampling_type'  : 'smnc'}
-}
+    
        , 'ros_paramD': { 'input_df'           : X_ros
                        , 'target'             : y_ros
                        , 'var_type'           : 'mixed'
                        , 'resampling_type'    : 'ros'}
-barD = {}
+        , 'rus_paramD' : { 'input_df'          : X_rus
-for k, v in fooD.items():
+                          , 'target'           : y_rus
-    #print(k)
+                          , 'var_type'         : 'mixed'
-    print(fooD[k])
+                          , 'resampling_type'  : 'rus'}
-    scores_7030D = MultModelsCl(**fooD[k]
+
        , 'rouC_paramD' : { 'input_df'         : X_rouC
                            , 'target'          : y_rouC
                            , 'var_type'        : 'mixed'
                            , 'resampling_type' : 'rouC'}
        }
 # Initial run to get the dict containing CV, BT and metadata DFs 
 mmD = {}
 for k, v in paramD.items():
 #    print(paramD[k])
    scores_7030D = MultModelsCl(**paramD[k]
                        , tts_split_type = tts_split_7030
                        , skf_cv = skf_cv
                        , blind_test_df = X_bts
                        , blind_test_target = y_bts
                        , add_cm = True 
-                        , add_yn = True)
+                        , add_yn = True
-    barD[k] = scores_7030D
+                        , return_formatted_output = True)
    mmD[k] = scores_7030D
-
+for k, v in mmD.items():
-ros_paramD = {input_df = X_ros
+    out_wf_7030 = pd.concat(mmD, ignore_index = True)
-                   , target = y_ros
+    
-                   , var_type = 'mixed'
+print('\n######################################################################'
-                   , resampling_type = 'smnc'}
+      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
-
+      , '\nGene:', gene.lower()
-
+      , '\nDrug:', drug
-rus_paramD = {input_df = X_rus
+      , '\noutput file:', outFile_wf
-                   , target = y_rus
+      , '\nDim of output:', out_wf_7030.shape
-                   , var_type = 'mixed'
+      , '\n######################################################################')
                   , resampling_type = 'rus'}
 rouC_paramD = {input_df = X_rouC
                   , target = y_rouC
                   , var_type = 'mixed'
                   , resampling_type = 'rouC'}
 #====
 scores_7030D = MultModelsCl(**rouC_paramD
                    , tts_split_type = tts_split_7030
                    , skf_cv = skf_cv
                    , blind_test_df = X_bts
                    , blind_test_target = y_bts
                    , add_cm = True 
                    , add_yn = True)
 ###############################################################################
 ###############################################################################
 #%% COMBINING all dfs: WF and LF
 # https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
 ###############################################################################
 #====================
 # Write output file
 #====================
-#combined_baseline_wf.to_csv(outFile_wf, index = False)
+out_wf_7030.to_csv(outFile_wf, index = False)
-#print('\nFile successfully written:', outFile_wf)
+print('\nFile successfully written:', outFile_wf)
 ###############################################################################
--- a/scripts/ml/running_ml_scripts.txt
+++ b/scripts/ml/running_ml_scripts.txt
@ -11,6 +11,7 @@ time ./run_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_7030.txt
 time ./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030.txt
 time ./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030.txt
 time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt
 # alr: # ERROR, as expected, too few values!
 # gid: problems
 ########################################################################
@ -73,9 +74,11 @@ time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt
 ########################################################################
 ########################################################################
 # running feature selection
 # Split:70/30
 time ./run_FS.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030.txt
 real	338m26.705s
 user	1946m12.173s
 sys	189m40.122s
--- a/scripts/ml/test_MultClfs.py
+++ b/scripts/ml/test_MultClfs.py
@ -7,21 +7,24 @@ Created on Fri Jun 24 11:07:05 2022
 """
 import re
 import argparse
 import os, sys
 ###############################################################################
 # gene  = 'pncA'
 # drug  = 'pyrazinamide'
 #total_mtblineage_uc = 8
-#%% command line args: case sensitive
+# #%% command line args: case sensitive
-arg_parser = argparse.ArgumentParser()
+# arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pncA')
+# arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
-arg_parser.add_argument('-g', '--gene', help = 'gene name', default = 'pyrazinamide') 
+# arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
-args = arg_parser.parse_args()
+# args = arg_parser.parse_args()
-drug    = args.drug
+# drug    = args.drug
-gene    = args.gene
+# gene    = args.gene
 ###############################################################################
 homedir = os.path.expanduser("~")
 sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
 ###############################################################################
 #==================
@ -79,7 +82,7 @@ mmD = MultModelsCl(input_df = X_smnc
 #================
 # MultModelsCl: WITH formatted output
 #================
-mmDF = MultModelsCl(input_df = X_smnc
+mmDF3 = MultModelsCl(input_df = X_smnc
                    , target = y_smnc
                    , var_type = 'mixed'
                    , tts_split_type = tts_split_7030
@ -96,4 +99,4 @@ mmDF = MultModelsCl(input_df = X_smnc
 # test function
 #=================
 # output from function call 
-ProcessMultModelCl(smnc_scores_mmD)
+ProcessMultModelsCl(mmD)