optimised run_7030.py to generate output from a dict now that the process function and parameter dicts have been added

Tanushree Tunstall 2022-06-24 15:40:18 +01:00
parent 7dc7e25016
commit b37a950fec
12 changed files with 180 additions and 128408 deletions
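At its core, the commit replaces five near-identical per-resampling blocks in run_7030.py with a parameter dict driving a single loop, then concatenates the per-run outputs. A minimal sketch of that pattern, with a stub standing in for MultModelsCl (the stub and its toy scores are illustrative, not the repo's implementation):

import pandas as pd

# Hypothetical stand-in for MultModelsCl(..., return_formatted_output = True),
# which per the diff below returns one formatted scores frame per call
def run_models(input_df, target, var_type, resampling_type):
    return pd.DataFrame({'Model_name': ['LR'], 'MCC': [0.5],
                         'Resampling': [resampling_type]})

paramD = {'baseline_paramD': {'input_df': None, 'target': None,
                              'var_type': 'mixed', 'resampling_type': 'none'},
          'smnc_paramD':     {'input_df': None, 'target': None,
                              'var_type': 'mixed', 'resampling_type': 'smnc'}}

mmD = {k: run_models(**v) for k, v in paramD.items()}
out_wf_7030 = pd.concat(mmD, ignore_index = True)  # one long df across resamplings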


@@ -197,35 +197,35 @@ def MultModelsCl(input_df, target, skf_cv
# Specify multiple Classification Models
#======================================================
models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
# , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
# , ('Decision Tree' , DecisionTreeClassifier(**rs) )
# , ('Extra Tree' , ExtraTreeClassifier(**rs) )
# , ('Extra Trees' , ExtraTreesClassifier(**rs) )
# , ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
# , ('Gaussian NB' , GaussianNB() )
# , ('Gaussian Process' , GaussianProcessClassifier(**rs) )
# , ('K-Nearest Neighbors' , KNeighborsClassifier() )
# , ('LDA' , LinearDiscriminantAnalysis() )
, ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
, ('Decision Tree' , DecisionTreeClassifier(**rs) )
, ('Extra Tree' , ExtraTreeClassifier(**rs) )
, ('Extra Trees' , ExtraTreesClassifier(**rs) )
, ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
, ('Gaussian NB' , GaussianNB() )
, ('Gaussian Process' , GaussianProcessClassifier(**rs) )
, ('K-Nearest Neighbors' , KNeighborsClassifier() )
, ('LDA' , LinearDiscriminantAnalysis() )
, ('Logistic Regression' , LogisticRegression(**rs) )
# , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
# , ('MLP' , MLPClassifier(max_iter = 500, **rs) )
# , ('Multinomial' , MultinomialNB() )
# , ('Naive Bayes' , BernoulliNB() )
# , ('Passive Aggressive' , PassiveAggressiveClassifier(**rs, **njobs) )
# , ('QDA' , QuadraticDiscriminantAnalysis() )
# , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) )
# , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
# , n_estimators = 1000
# , bootstrap = True
# , oob_score = True
# , **njobs
# , **rs
# , max_features = 'auto') )
# , ('Ridge Classifier' , RidgeClassifier(**rs) )
# , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
# , ('SVC' , SVC(**rs) )
# , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
# , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
, ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
, ('MLP' , MLPClassifier(max_iter = 500, **rs) )
, ('Multinomial' , MultinomialNB() )
, ('Naive Bayes' , BernoulliNB() )
, ('Passive Aggressive' , PassiveAggressiveClassifier(**rs, **njobs) )
, ('QDA' , QuadraticDiscriminantAnalysis() )
, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) )
, ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
, n_estimators = 1000
, bootstrap = True
, oob_score = True
, **njobs
, **rs
, max_features = 'auto') )
, ('Ridge Classifier' , RidgeClassifier(**rs) )
, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
, ('SVC' , SVC(**rs) )
, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
]
mm_skf_scoresD = {}
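The hunk above uncomments the full classifier roster so every model runs. The loop that consumes these (name, estimator) pairs sits below this hunk; a minimal sketch of that pattern under stated assumptions (toy data, two models, sklearn's cross_validate rather than the repo's exact loop body):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples = 200, random_state = 42)
rs = {'random_state': 42}
skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, **rs)

models = [('Logistic Regression', LogisticRegression(**rs))
          , ('Decision Tree', DecisionTreeClassifier(**rs))]

mm_skf_scores_sketch = {}
for model_name, model_fn in models:
    skf_scores = cross_validate(model_fn, X, y, cv = skf_cv
                                , scoring = ['accuracy', 'matthews_corrcoef'])
    # store mean CV scores keyed by model name, as mm_skf_scoresD suggests
    mm_skf_scores_sketch[model_name] = {k: v.mean() for k, v in skf_scores.items()}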
@@ -440,10 +440,11 @@ def ProcessMultModelsCl(inputD = {}):
, '\nCV df:', len(scoresDF_CV.columns)
, '\nBT_df:', len(scoresDF_BT.columns)
, '\nmetaDF:', len(metaDF.columns))
if len(scoresDF_CV.columns) == len(scoresDF_BT.columns):
print('\nFirst proceeding to rowbind CV and BT dfs:')
expected_ncols_out = len(scoresDF_BT.columns) + len(metaDF.columns)
print('\nFinal output should have:',expected_ncols_out, 'columns' )
print('\nFinal output should have:', expected_ncols_out, 'columns' )
#-----------------
# Combine WF
@@ -496,8 +497,7 @@ def ProcessMultModelsCl(inputD = {}):
sys.exit('\nFIRST IF FAILS')
else:
print('\nConcatenating dfs not possible [WF], check numbers ')
#-------------------------------------
# Combine WF+Metadata: Final output
#-------------------------------------
@@ -515,11 +515,15 @@ def ProcessMultModelsCl(inputD = {}):
print('\nPASS: Combined df has expected ncols')
else:
sys.exit('\nFAIL: Length mismatch for combined_df')
print('\nAdding column: Model_name')
combDF['Model_name'] = combDF.index
print('\n========================================================='
, '\nSUCCESS: Ran multiple classifiers'
, '\n=======================================================')
#resampling_methods_wf = combined_baseline_wf[['resampling']]
#resampling_methods_wf = resampling_methods_wf.drop_duplicates()
#, '\n', resampling_methods_wf)
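Taken together, the hunks above make ProcessMultModelsCl row-bind the CV and BT score frames, column-bind the metadata, and record each model's name from the index. A minimal sketch of that combine step, with toy frames standing in for scoresDF_CV, scoresDF_BT and metaDF (assumption: all three are indexed by model name):

import pandas as pd

scoresDF_CV = pd.DataFrame({'MCC': [0.50], 'source_data': ['CV']}, index = ['LR'])
scoresDF_BT = pd.DataFrame({'MCC': [0.45], 'source_data': ['BT']}, index = ['LR'])
metaDF      = pd.DataFrame({'n_features': [174]}, index = ['LR'])

if len(scoresDF_CV.columns) == len(scoresDF_BT.columns):
    print('\nFirst proceeding to rowbind CV and BT dfs:')
    combDF = pd.concat([scoresDF_CV, scoresDF_BT], axis = 0)  # rowbind WF
    combDF = combDF.join(metaDF)                              # add metadata per model
    expected_ncols_out = len(scoresDF_BT.columns) + len(metaDF.columns)
    assert len(combDF.columns) == expected_ncols_out
    combDF['Model_name'] = combDF.index                       # as in the hunk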


@@ -1,72 +0,0 @@
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py:464: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
1.22.4
1.4.1
aaindex_df contains non-numerical data
Total no. of non-numerial columns: 2
Selecting numerical data only
PASS: successfully selected numerical columns only for aaindex_df
Now checking for NA in the remaining aaindex_cols
Counting aaindex_df cols with NA
ncols with NA: 4 columns
Dropping these...
Original ncols: 127
Revised df ncols: 123
Checking NA in revised df...
PASS: cols with NA successfully dropped from aaindex_df
Proceeding with combining aa_df with other features_df
PASS: ncols match
Expected ncols: 123
Got: 123
Total no. of columns in clean aa_df: 123
Proceeding to merge, expected nrows in merged_df: 271
PASS: my_features_df and aa_df successfully combined
nrows: 271
ncols: 269
count of NULL values before imputation
or_mychisq 256
log10_or_mychisq 256
dtype: int64
count of NULL values AFTER imputation
mutationinformation 0
or_rawI 0
logorI 0
dtype: int64
PASS: OR values imputed, data ready for ML
Total no. of features for aaindex: 123
PASS: x_features has no target variable
No. of columns for x_features: 174
Traceback (most recent call last):
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./run_7030.py", line 35, in <module>
setvars(gene,drug)
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py", line 636, in setvars
X, X_bts, y, y_bts = train_test_split(x_features, y_target
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 2454, in train_test_split
train, test = next(cv.split(X=arrays[0], y=stratify))
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1613, in split
for train, test in self._iter_indices(X, y, groups):
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1953, in _iter_indices
raise ValueError(
ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
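The deleted log above ends in a real failure mode worth noting: train_test_split raises this ValueError whenever stratify is given and the rarest class has fewer than two members. A small self-contained guard of the kind that could precede the split (a sketch, not the repo's actual fix):

from collections import Counter
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples = 100, weights = [0.9], random_state = 0)

rarest_n = min(Counter(y).values())
if rarest_n < 2:
    # stratified splitting needs at least 2 members per class
    raise SystemExit('Too few members in rarest class: ' + str(rarest_n))
X_tr, X_bts, y_tr, y_bts = train_test_split(X, y, test_size = 0.3
                                            , stratify = y, random_state = 42)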

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -557,7 +557,7 @@ def setvars(gene,drug):
# FG5: Genomic features
#========================
X_gn_mafor_Fnum = ['maf'
, 'logorI'
#, 'logorI'
# , 'or_rawI'
# , 'or_mychisq'
# , 'or_logistic'

635 scripts/ml/run_7030.py Executable file → Normal file

@@ -5,28 +5,28 @@ Created on Mon Jun 20 13:05:23 2022
@author: tanu
"""
#%%Imports ####################################################################
import re
import argparse
###############################################################################
import os, sys
# gene = 'pncA'
# drug = 'pyrazinamide'
#total_mtblineage_uc = 8
###############################################################################
#%% command line args: case sensitive
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
args = arg_parser.parse_args()
# arg_parser = argparse.ArgumentParser()
# arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
# arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
# args = arg_parser.parse_args()
drug = args.drug
gene = args.gene
# drug = args.drug
# gene = args.gene
###############################################################################
#==================
# other vars
#==================
tts_split = '70/30'
OutFile_suffix = '7030'
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
###############################################################################
#==================
# Import data
@@ -39,10 +39,15 @@ from ml_data_7030 import *
#from UQ_yc_RunAllClfs import run_all_ML
#====================
# Import ML function
# Import ML functions
#====================
# TT run all ML clfs: baseline model
from MultModelsCl import MultModelsCl
from MultClfs import *
#==================
# other vars
#==================
tts_split_7030 = '70_30'
OutFile_suffix = '7030'
#==================
# Specify outdir
@@ -51,563 +56,71 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
#outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
###############################################################################
score_type_ordermapD = { 'mcc' : 1
, 'fscore' : 2
, 'jcc' : 3
, 'precision' : 4
, 'recall' : 5
, 'accuracy' : 6
, 'roc_auc' : 7
, 'TN' : 8
, 'FP' : 9
, 'FN' : 10
, 'TP' : 11
, 'trainingY_neg': 12
, 'trainingY_pos': 13
, 'blindY_neg' : 14
, 'blindY_pos' : 15
, 'fit_time' : 16
, 'score_time' : 17
}
scoreCV_mapD = {'test_mcc' : 'MCC'
, 'test_fscore' : 'F1'
, 'test_precision' : 'Precision'
, 'test_recall' : 'Recall'
, 'test_accuracy' : 'Accuracy'
, 'test_roc_auc' : 'ROC_AUC'
, 'test_jcc' : 'JCC'
}
scoreBT_mapD = {'bts_mcc' : 'MCC'
, 'bts_fscore' : 'F1'
, 'bts_precision' : 'Precision'
, 'bts_recall' : 'Recall'
, 'bts_accuracy' : 'Accuracy'
, 'bts_roc_auc' : 'ROC_AUC'
, 'bts_jcc' : 'JCC'
}
# data dependent variables but NOT dependent on resampling
bts_size = len(X_bts)
yc2 = Counter(y_bts)
yc2_ratio = yc2[0]/yc2[1]
###############################################################################
#%% Running models ############################################################
print('\n#####################################################################\n'
, '\nRunning ML analysis: feature groups '
, '\nStarting--> Running ML analysis: Baseline models (No FS)'
, '\nGene name:', gene
, '\nDrug name:', drug)
, '\nDrug name:', drug
, '\n#####################################################################\n')
#%% Basic: No Oversampling
#================
# Baseline
# No resampling
#================
scores_mmD = MultModelsCl(input_df = X
, target = y
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
baseline_all_scores = pd.DataFrame(scores_mmD)
rs_none = 'none'
#------------------------
# WF: only CV and BTS
#-----------------------
baseline_allT = baseline_all_scores.T
#baseline_train = baseline_all.filter(regex='train_', axis=1)
baseline_CV = baseline_allT.filter(regex='test_', axis = 1); baseline_CV.columns
# map colnames for consistency to allow concatenating
baseline_CV.columns = baseline_CV.columns.map(scoreCV_mapD); baseline_CV.columns
baseline_CV['Data_source'] = 'CV'
baseline_CV['Resampling'] = rs_none
baseline_BT = baseline_allT.filter(regex='bts_', axis = 1); baseline_BT.columns
# map colnames for consistency to allow concatenating
baseline_BT.columns = baseline_BT.columns.map(scoreBT_mapD); baseline_BT.columns
baseline_BT['Data_source'] = 'BT'
baseline_BT['Resampling'] = rs_none
# # Write csv
#baseline_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
#baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# baseline_CV.to_csv(outdir_ml + gene.lower() + '_baseline_CV_allF.csv')
# baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#-----------------------------------
# other data dependent variables
training_size_ns = len(X)
n_features = len(X.columns)
yc1 = Counter(y)
yc1_ratio = yc1[0]/yc1[1]
baseline_all = baseline_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
baseline_all = baseline_all.reset_index()
baseline_all.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
baseline_all['Data_source'] = baseline_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
baseline_all['score_type'] = baseline_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(baseline_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
baseline_all['score_order'] = baseline_all['score_type'].map(score_type_ordermapD)
baseline_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
baseline_all['Resampling'] = rs_none
baseline_all['training_size'] = training_size_ns
baseline_all['trainingY_ratio']= round(yc1_ratio,2)
baseline_all['n_features'] = n_features
###############################################################################
#%% SMOTE NC: Smote Oversampling [Numerical + categorical]
#================
# Baseline
# SMOTE NC: SMNC
#================
smnc_scores_mmD = MultModelsCl(input_df = X_smnc
, target = y_smnc
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
smnc_all_scores = pd.DataFrame(smnc_scores_mmD)
rs_smnc = 'smnc'
#------------------------
# WF: only CV and BTS
#-----------------------
smnc_allT = smnc_all_scores.T
smnc_CV = smnc_allT.filter(regex='test_', axis = 1); smnc_CV.columns
# map colnames for consistency to allow concatenating
smnc_CV.columns = smnc_CV.columns.map(scoreCV_mapD); smnc_CV.columns
smnc_CV['Data_source'] = 'CV'
smnc_CV['Resampling'] = rs_smnc
smnc_BT = smnc_allT.filter(regex='bts_', axis = 1); smnc_BT.columns
# map colnames for consistency to allow concatenating
smnc_BT.columns = smnc_BT.columns.map(scoreBT_mapD); smnc_BT.columns
smnc_BT['Data_source'] = 'BT'
smnc_BT['Resampling'] = rs_smnc
# Write csv
# smnc_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# smnc_CV.to_csv(outdir_ml + gene.lower() + '_smnc_CV_allF.csv')
# smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#-----------------------------------
# other data dependent variables
training_size_smnc = len(X_smnc)
n_features = len(X_smnc.columns)
yc1_smnc = Counter(y_smnc)
yc1_ratio_smnc = yc1_smnc[0]/yc1_smnc[1]
smnc_all = smnc_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
smnc_all = smnc_all.reset_index()
smnc_all.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
smnc_all['Data_source'] = smnc_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
smnc_all['score_type'] = smnc_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(smnc_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
smnc_all['score_order'] = smnc_all['score_type'].map(score_type_ordermapD)
smnc_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
smnc_all['Resampling'] = rs_smnc
smnc_all['training_size'] = training_size_smnc
smnc_all['trainingY_ratio'] = round(yc1_ratio_smnc,2)
smnc_all['n_features'] = n_features
###############################################################################
#%% ROS: Random Over Sampling [Numerical + categorical]
#================
# Baseline
# ROS
#================
ros_scores_mmD = MultModelsCl(input_df = X_ros
, target = y_ros
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
ros_all_scores = pd.DataFrame(ros_scores_mmD)
rs_ros = 'ros'
#------------------------
# WF: only CV and BTS
#-----------------------
ros_allT = ros_all_scores.T
ros_CV = ros_allT.filter(regex='test_', axis = 1); ros_CV.columns
# map colnames for consistency to allow concatenating
ros_CV.columns = ros_CV.columns.map(scoreCV_mapD); ros_CV.columns
ros_CV['Data_source'] = 'CV'
ros_CV['Resampling'] = rs_ros
ros_BT = ros_allT.filter(regex='bts_', axis = 1); ros_BT.columns
# map colnames for consistency to allow concatenating
ros_BT.columns = ros_BT.columns.map(scoreBT_mapD); ros_BT.columns
ros_BT['Data_source'] = 'BT'
ros_BT['Resampling'] = rs_ros
# Write csv
# ros_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# ros_CV.to_csv(outdir_ml + gene.lower() + '_ros_CV_allF.csv')
# ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#----------------------------------
# other data dependent variables
training_size_ros = len(X_ros)
n_features = len(X_ros.columns)
yc1_ros = Counter(y_ros)
yc1_ratio_ros = yc1_ros[0]/yc1_ros[1]
ros_all = ros_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
ros_all = ros_all.reset_index()
ros_all.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
ros_all['Data_source'] = ros_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
ros_all['score_type'] = ros_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(ros_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
ros_all['score_order'] = ros_all['score_type'].map(score_type_ordermapD)
ros_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
ros_all['Resampling'] = rs_ros
ros_all['training_size'] = training_size_ros
ros_all['trainingY_ratio'] = round(yc1_ratio_ros,2)
ros_all['n_features'] = n_features
###############################################################################
#%% RUS: Random Under Sampling [Numerical + categorical]
#================
# Baseline
# RUS
#================
rus_scores_mmD = MultModelsCl(input_df = X_rus
, target = y_rus
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
rus_all_scores = pd.DataFrame(rus_scores_mmD)
rs_rus = 'rus'
#-----------------------
# WF: only CV and BTS
#-----------------------
rus_allT = rus_all_scores.T
rus_CV = rus_allT.filter(regex='test_', axis=1); rus_CV.columns
# map colnames for consistency to allow concatenating
rus_CV.columns = rus_CV.columns.map(scoreCV_mapD); rus_CV.columns
rus_CV['Data_source'] = 'CV'
rus_CV['Resampling'] = rs_rus
rus_BT = rus_allT.filter(regex='bts_' , axis=1); rus_BT.columns
# map colnames for consistency to allow concatenating
rus_BT.columns = rus_BT.columns.map(scoreBT_mapD); rus_BT.columns
rus_BT['Data_source'] = 'BT'
rus_BT['Resampling'] = rs_rus
# # Write csv
# rus_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# rus_CV.to_csv(outdir_ml + gene.lower() + '_rus_CV_allF.csv')
# rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#----------------------------------
# other data dependent variables
training_size_rus = len(X_rus)
n_features = len(X_rus.columns)
yc1_rus = Counter(y_rus)
yc1_ratio_rus = yc1_rus[0]/yc1_rus[1]
rus_all = rus_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
rus_all = rus_all.reset_index()
rus_all.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
rus_all['Data_source'] = rus_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
rus_all['score_type'] = rus_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(rus_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
rus_all['score_order'] = rus_all['score_type'].map(score_type_ordermapD)
rus_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
rus_all['Resampling'] = rs_rus
rus_all['training_size'] = training_size_rus
rus_all['trainingY_ratio'] = round(yc1_ratio_rus,2)
rus_all['n_features'] = n_features
###############################################################################
#%% ROS+RUS Combined: [Numerical + categorical]
#================
# Baseline
# ROUC
#================
rouC_scores_mmD = MultModelsCl(input_df = X_rouC
, target = y_rouC
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
rouC_all_scores = pd.DataFrame(rouC_scores_mmD)
rs_rouC = 'rouC'
#-----------------------
# WF: only CV and BTS
#-----------------------
rouC_allT = rouC_all_scores.T
rouC_CV = rouC_allT.filter(regex='test_', axis=1); rouC_CV.columns
# map colnames for consistency to allow concatenating
rouC_CV.columns = rouC_CV.columns.map(scoreCV_mapD); rouC_CV.columns
rouC_CV['Data_source'] = 'CV'
rouC_CV['Resampling'] = rs_rouC
rouC_BT = rouC_allT.filter(regex='bts_', axis=1); rouC_BT.columns
# map colnames for consistency to allow concatenating
rouC_BT.columns = rouC_BT.columns.map(scoreBT_mapD); rouC_BT.columns
rouC_BT['Data_source'] = 'BT'
rouC_BT['Resampling'] = rs_rouC
# Write csv
# rouC_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# rouC_CV.to_csv(outdir_ml + gene.lower() + '_rouC_CV_allF.csv')
# rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#----------------------------------
# other data dependent variables
training_size_rouC = len(X_rouC)
n_features = len(X_rouC.columns)
yc1_rouC = Counter(y_rouC)
yc1_ratio_rouC = yc1_rouC[0]/yc1_rouC[1]
rouC_all = rouC_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
rouC_all = rouC_all.reset_index()
rouC_all.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
rouC_all['Data_source'] = rouC_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
rouC_all['score_type'] = rouC_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(rouC_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
rouC_all['score_order'] = rouC_all['score_type'].map(score_type_ordermapD)
rouC_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
rouC_all['Resampling'] = rs_rouC
rouC_all['training_size'] = training_size_rouC
rouC_all['trainingY_ratio'] = round(yc1_ratio_rouC,2)
rouC_all['n_features'] = n_features
###############################################################################
#%% COMBINING all dfs: WF and LF
# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
#%% Combine WF
#-----------------
# Combine WF
#-----------------
dfs_combine_wf = [baseline_BT, smnc_BT, ros_BT, rus_BT, rouC_BT,
baseline_CV, smnc_CV, ros_CV, rus_CV, rouC_CV]
dfs_nrows_wf = []
for df in dfs_combine_wf:
dfs_nrows_wf = dfs_nrows_wf + [len(df)]
dfs_nrows_wf = max(dfs_nrows_wf)
paramD = {
'baseline_paramD': { 'input_df' : X
, 'target' : y
, 'var_type' : 'mixed'
, 'resampling_type': 'none'}
, 'smnc_paramD': { 'input_df' : X_smnc
, 'target' : y_smnc
, 'var_type' : 'mixed'
, 'resampling_type' : 'smnc'}
dfs_ncols_wf = []
for df in dfs_combine_wf:
dfs_ncols_wf = dfs_ncols_wf + [len(df.columns)]
dfs_ncols_wf = max(dfs_ncols_wf)
, 'ros_paramD': { 'input_df' : X_ros
, 'target' : y_ros
, 'var_type' : 'mixed'
, 'resampling_type' : 'ros'}
expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
expected_ncols_wf = dfs_ncols_wf
, 'rus_paramD' : { 'input_df' : X_rus
, 'target' : y_rus
, 'var_type' : 'mixed'
, 'resampling_type' : 'rus'}
common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))
, 'rouC_paramD' : { 'input_df' : X_rouC
, 'target' : y_rouC
, 'var_type' : 'mixed'
, 'resampling_type' : 'rouC'}
}
if len(common_cols_wf) == dfs_ncols_wf :
combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index=False)
resampling_methods_wf = combined_baseline_wf[['Resampling']]
resampling_methods_wf = resampling_methods_wf.drop_duplicates()
print('\nConcatenating dfs with different resampling methods [WF]:', tts_split
, '\nNo. of dfs combining:', len(dfs_combine_wf)
, '\nThe sampling methods are:'
, '\n', resampling_methods_wf)
if len(combined_baseline_wf) == expected_nrows_wf and len(combined_baseline_wf.columns) == expected_ncols_wf:
print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
, '\nnrows in combined_df_wf:', len(combined_baseline_wf)
, '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
else:
print('\nFAIL: concatenating failed'
, '\nExpected nrows:', expected_nrows_wf
, '\nGot:', len(combined_baseline_wf)
, '\nExpected ncols:', expected_ncols_wf
, '\nGot:', len(combined_baseline_wf.columns))
sys.exit()
else:
sys.exit('\nConcatenating dfs not possible [WF], check numbers ')
# Initial run to get the dict containing CV, BT and metadata DFs
mmD = {}
for k, v in paramD.items():
# print(mmD[k])
scores_7030D = MultModelsCl(**paramD[k]
, tts_split_type = tts_split_7030
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True
, return_formatted_output = True)
mmD[k] = scores_7030D
# Extracting the dfs from within the dict and concatenating to output as one df
for k, v in mmD.items():
out_wf_7030 = pd.concat(mmD, ignore_index = True)
# Add index as a column
combined_baseline_wf.columns
combined_baseline_wf = combined_baseline_wf.reset_index()
combined_baseline_wf.rename(columns = {'index': 'Model_name'}, inplace = True)
combined_baseline_wf.head()
# sort df: Resampling, Data_source, and MCC
combined_baseline_wf.sort_values(by = ['Resampling', 'Data_source', 'MCC'], ascending = [True, True, False], inplace = True)
##############################################################################
#%% Combine LF
#-----------------
# Combine LF*
#-----------------
dfs_combine = [baseline_all, smnc_all, ros_all, rus_all, rouC_all ]
dfs_nrows = []
for df in dfs_combine:
dfs_nrows = dfs_nrows + [len(df)]
dfs_nrows = max(dfs_nrows)
dfs_ncols = []
for df in dfs_combine:
dfs_ncols = dfs_ncols + [len(df.columns)]
dfs_ncols = max(dfs_ncols)
# dfs_ncols = []
# dfs_ncols2 = mode(dfs_ncols.append(len(df.columns) for df in dfs_combine)
# dfs_ncols2
expected_nrows = len(dfs_combine) * dfs_nrows
expected_ncols = dfs_ncols
common_cols = list(set.intersection(*(set(df.columns) for df in dfs_combine)))
if len(common_cols) == dfs_ncols :
combined_baseline = pd.concat([df[common_cols] for df in dfs_combine], ignore_index=True)
resampling_methods = combined_baseline[['Resampling', 'training_size']]
resampling_methods = resampling_methods.drop_duplicates()
print('\nConcatenating dfs with different resampling methods:', tts_split
, '\nNo. of dfs combining:', len(dfs_combine)
, '\nThe sampling methods are:'
, '\n', resampling_methods)
if len(combined_baseline) == expected_nrows and len(combined_baseline.columns) == expected_ncols:
print('\nPASS:', len(dfs_combine), 'dfs successfully combined'
, '\nnrows in combined_df:', len(combined_baseline)
, '\nncols in combined_df:', len(combined_baseline.columns))
else:
print('\nFAIL: concatenating failed'
, '\nExpected nrows:', expected_nrows
, '\nGot:', len(combined_baseline)
, '\nExpected ncols:', expected_ncols
, '\nGot:', len(combined_baseline.columns))
sys.exit()
else:
sys.exit('\nConcatenating dfs not possible, check numbers ')
# Add further column indications
combined_baseline['test_size'] = bts_size
combined_baseline['tts_split'] = tts_split
combined_baseline['testY_ratio'] = round(yc2_ratio,2)
#combined_baseline.columns
# change to column names to be lower case for consistency
combined_baseline.rename(columns = {'Resampling' : 'resampling'
, 'Data_source': 'data_source'}, inplace = True)
combined_baseline.columns
# sort df: resampling, data_source, mcc
combined_baseline.sort_values(by = ['resampling', 'data_source', 'score_order'], ascending = [True, True, True], inplace = True)
# # row bind
# if all(XXX):
# print('\nPASS:colnames match, proceeding to rowbind')
# comb_df = pd.concat()], axis = 0, ignore_index = True ) combined_baseline
print('\n######################################################################'
, '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
, '\nGene:', gene.lower()
, '\nDrug:', drug
, '\noutput file:', outFile_wf
, '\nDim of output:', out_wf_7030.shape
, '\n######################################################################')
###############################################################################
#====================
# Write output file
#====================
combined_baseline_wf.to_csv(outFile_wf, index = False)
#out_wf_7030.to_csv(outFile_wf, index = False)
print('\nFile successfully written:', outFile_wf)
combined_baseline.to_csv(outFile_lf, index = False)
print('\nFile successfully written:', outFile_lf)
###############################################################################


@@ -9,6 +9,8 @@ Created on Mon Jun 20 13:05:23 2022
import re
import argparse
import os, sys
import collections
# gene = 'pncA'
# drug = 'pyrazinamide'
#total_mtblineage_uc = 8
@@ -25,6 +27,7 @@ import os, sys
###############################################################################
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
###############################################################################
#==================
# Import data
@@ -54,79 +57,70 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
#outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
#%% Running models ############################################################
print('\n#####################################################################\n'
, '\nRunning ML analysis: feature groups '
, '\nStarting--> Running ML analysis: Baseline models (No FS)'
, '\nGene name:', gene
, '\nDrug name:', drug)
, '\nDrug name:', drug
, '\n#####################################################################\n')
fooD = {'baseline_paramD': {
'input_df': X
, 'target': y
, 'var_type': 'mixed'
, 'resampling_type': 'none'}
,
'smnc_paramD': {'input_df': X_smnc
, 'target': y_smnc
, 'var_type': 'mixed'
, 'resampling_type': 'smnc'}
}
paramD = {
'baseline_paramD': { 'input_df' : X
, 'target' : y
, 'var_type' : 'mixed'
, 'resampling_type': 'none'}
, 'smnc_paramD': { 'input_df' : X_smnc
, 'target' : y_smnc
, 'var_type' : 'mixed'
, 'resampling_type' : 'smnc'}
, 'ros_paramD': { 'input_df' : X_ros
, 'target' : y_ros
, 'var_type' : 'mixed'
, 'resampling_type' : 'ros'}
barD = {}
for k, v in fooD.items():
#print(k)
print(fooD[k])
scores_7030D = MultModelsCl(**fooD[k]
, 'rus_paramD' : { 'input_df' : X_rus
, 'target' : y_rus
, 'var_type' : 'mixed'
, 'resampling_type' : 'rus'}
, 'rouC_paramD' : { 'input_df' : X_rouC
, 'target' : y_rouC
, 'var_type' : 'mixed'
, 'resampling_type' : 'rouC'}
}
# Initial run to get the dict containing CV, BT and metadata DFs
mmD = {}
for k, v in paramD.items():
# print(fooD[k])
scores_7030D = MultModelsCl(**paramD[k]
, tts_split_type = tts_split_7030
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
barD[k] = scores_7030D
, add_yn = True
, return_formatted_output = True)
mmD[k] = scores_7030D
ros_paramD = {input_df = X_ros
, target = y_ros
, var_type = 'mixed'
, resampling_type = 'smnc'}
rus_paramD = {input_df = X_rus
, target = y_rus
, var_type = 'mixed'
, resampling_type = 'rus'}
rouC_paramD = {input_df = X_rouC
, target = y_rouC
, var_type = 'mixed'
, resampling_type = 'rouC'}
#====
scores_7030D = MultModelsCl(**rouC_paramD
, tts_split_type = tts_split_7030
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
###############################################################################
###############################################################################
#%% COMBINING all dfs: WF and LF
# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
for k, v in mmD.items():
out_wf_7030 = pd.concat(mmD, ignore_index = True)
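One note on the concat just above: pd.concat accepts the dict of frames directly, so the enclosing for loop recomputes the same result on every iteration and a single call would do. A self-contained check:

import pandas as pd

mmD = {'baseline_paramD': pd.DataFrame({'MCC': [0.50]})
       , 'smnc_paramD': pd.DataFrame({'MCC': [0.55]})}

out_wf_7030 = pd.concat(mmD, ignore_index = True)  # one call, loop not needed
print(out_wf_7030)  # two rows; dict keys dropped because ignore_index = True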
print('\n######################################################################'
, '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
, '\nGene:', gene.lower()
, '\nDrug:', drug
, '\noutput file:', outFile_wf
, '\nDim of output:', out_wf_7030.shape
, '\n######################################################################')
###############################################################################
#====================
# Write output file
#====================
#combined_baseline_wf.to_csv(outFile_wf, index = False)
#print('\nFile successfully written:', outFile_wf)
out_wf_7030.to_csv(outFile_wf, index = False)
print('\nFile successfully written:', outFile_wf)
###############################################################################


@@ -11,6 +11,7 @@ time ./run_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_7030.txt
time ./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030.txt
time ./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030.txt
time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt
# alr: # ERROR, as expected, too few values!
# gid: problems
########################################################################
@@ -73,9 +74,11 @@ time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt
########################################################################
########################################################################
# running feature selection
# Split: 70/30
time ./run_FS.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030.txt
real 338m26.705s
user 1946m12.173s
sys 189m40.122s


@@ -7,21 +7,24 @@ Created on Fri Jun 24 11:07:05 2022
"""
import re
import argparse
import os, sys
###############################################################################
# gene = 'pncA'
# drug = 'pyrazinamide'
#total_mtblineage_uc = 8
#%% command line args: case sensitive
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pncA')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = 'pyrazinamide')
args = arg_parser.parse_args()
# #%% command line args: case sensitive
# arg_parser = argparse.ArgumentParser()
# arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
# arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
# args = arg_parser.parse_args()
drug = args.drug
gene = args.gene
# drug = args.drug
# gene = args.gene
###############################################################################
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
###############################################################################
#==================
@@ -79,7 +82,7 @@ mmD = MultModelsCl(input_df = X_smnc
#================
# MultModelsCl: WITH formatted output
#================
mmDF = MultModelsCl(input_df = X_smnc
mmDF3 = MultModelsCl(input_df = X_smnc
, target = y_smnc
, var_type = 'mixed'
, tts_split_type = tts_split_7030
@@ -96,4 +99,4 @@ mmDF = MultModelsCl(input_df = X_smnc
# test function
#=================
# output from function call
ProcessMultModelCl(smnc_scores_mmD)
ProcessMultModelsCl(mmD)
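For reference, the round trip this test exercises, sketched with hypothetical stubs (per the diff, MultModelsCl returns a raw per-model dict unless return_formatted_output = True, and ProcessMultModelsCl formats such a dict into one frame; the shapes here are illustrative only):

import pandas as pd

def ProcessMultModelsCl_stub(mmD):
    return (pd.DataFrame(mmD).T.reset_index()
              .rename(columns = {'index': 'Model_name'}))

def MultModelsCl_stub(return_formatted_output = False):
    raw = {'LR': {'test_mcc': 0.50, 'bts_mcc': 0.45}}
    return ProcessMultModelsCl_stub(raw) if return_formatted_output else raw

mmD   = MultModelsCl_stub()                                # raw dict, like mmD above
mmDF3 = MultModelsCl_stub(return_formatted_output = True)  # formatted, like mmDF3
assert mmDF3.equals(ProcessMultModelsCl_stub(mmD))         # the round trip the test checks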