added metadata output for running multiple models

2022-06-23 21:25:00 +01:00 · 2022-06-23 21:25:00 +01:00 · 4fe62c072b
commit 4fe62c072b
parent 5dea35f97c
7 changed files with 325 additions and 88 deletions
--- a/scripts/ml/FS.py
+++ b/scripts/ml/FS.py
@ -108,6 +108,7 @@ def fsgs(input_df
         , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv =  skf_cv, scoring = 'matthews_corrcoef')
         , cv_method =  skf_cv
         , var_type = ['numerical', 'categorical' , 'mixed']
         , verbose = 3
         ):
    '''
    returns
--- a/scripts/ml/MultModelsCl.py
+++ b/scripts/ml/MultModelsCl.py
@ -98,14 +98,25 @@ rskf_cv = RepeatedStratifiedKFold(n_splits = 10
 mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
 jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
 #FIXME
 #====================
 # Import ProcessFunc
 #====================
 #from ProcessMultModelCl import *
 #%%
 # Multiple Classification - Model Pipeline
 def MultModelsCl(input_df, target, skf_cv
                       , blind_test_df
                       , blind_test_target
                       , tts_split_type 
                       , resampling_type = 'none' # default
                       , add_cm = True # adds confusion matrix based on cross_val_predict
                       , add_yn = True  # adds target var class numbers
-                       , var_type = ['numerical', 'categorical','mixed']):
+                       , var_type = ['numerical', 'categorical','mixed']
                       , return_formatted_output = True):
    '''
    @ param input_df: input features 
@ -151,37 +162,37 @@ def MultModelsCl(input_df, target, skf_cv
    #======================================================
    # Specify multiple Classification Models  
    #======================================================
-    models = [('Logistic Regression'       , LogisticRegression(**rs) )
+    models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
-            , ('Logistic RegressionCV'     , LogisticRegressionCV(**rs) )
+            #  , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
-            , ('Gaussian NB'               , GaussianNB() )
+            #   , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
-            , ('Naive Bayes'               , BernoulliNB() )
+            #   , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
-            , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
+            #   , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
-            , ('SVC'                       , SVC(**rs) ) 
+            #   , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
-            , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
+            #   , ('Gaussian NB'               , GaussianNB() )
-            , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
+            #   , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
-            , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
+            #   , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
-            , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
+            #   , ('LDA'                       , LinearDiscriminantAnalysis() )
-            , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000 ) ) 
+               , ('Logistic Regression'       , LogisticRegression(**rs) )
-            , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
+            #   , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
-                                                                    , n_estimators     = 1000
+            #   , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
-                                                                    , bootstrap        = True
+            #   , ('Multinomial'               , MultinomialNB() )
-                                                                    , oob_score        = True
+            #   , ('Naive Bayes'               , BernoulliNB() )
-                                                                    , **njobs
+            #   , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
-                                                                    , **rs
+            #   , ('QDA'                       , QuadraticDiscriminantAnalysis() )
-                                                                    , max_features     = 'auto') ) 
+            #   , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000 ) ) 
-            , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
+            #    , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
-            , ('LDA'                       , LinearDiscriminantAnalysis() )
+            #                                                          , n_estimators     = 1000
-            , ('Multinomial'               , MultinomialNB() )
+            #                                                          , bootstrap        = True
-            , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
+            #                                                          , oob_score        = True
-            , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
+            #                                                          , **njobs
-            , ('AdaBoost Classifier'       , AdaBoostClassifier(**rs) )
+            #                                                          , **rs
-            , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
+            #                                                          , max_features     = 'auto') ) 
-            , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
+            #   , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
-            , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
+            #   , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
-            , ('QDA'                       , QuadraticDiscriminantAnalysis() )
+            #   , ('SVC'                       , SVC(**rs) ) 
-            , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
+            #   , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-            , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 10) )
+            #   , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
-            ]
+             ]
    mm_skf_scoresD = {}
@ -314,5 +325,34 @@ def MultModelsCl(input_df, target, skf_cv
        mm_skf_scoresD[model_name]['bts_jcc']       = round(jaccard_score(blind_test_target, bts_predict),2)
        #mm_skf_scoresD[model_name]['diff_mcc']      = train_test_diff_MCC
-    return(mm_skf_scoresD)
+    #return(mm_skf_scoresD)
 #%%
        # ADD more info: meta data related to input and blind and resampling
        # target numbers: training
        yc1           = Counter(target)
        yc1_ratio     = yc1[0]/yc1[1]
        # target numbers: test
        yc2       = Counter(blind_test_target)
        yc2_ratio = yc2[0]/yc2[1]
        mm_skf_scoresD[model_name]['resampling']      = resampling_type
        mm_skf_scoresD[model_name]['training_size']   = len(input_df)
        mm_skf_scoresD[model_name]['trainingY_ratio'] = round(yc1_ratio, 2)
        mm_skf_scoresD[model_name]['testSize']       = len(blind_test_df)
        mm_skf_scoresD[model_name]['testY_ratio']     = round(yc2_ratio,2)
        mm_skf_scoresD[model_name]['n_features']      = len(input_df.columns)
        mm_skf_scoresD[model_name]['tts_split']       = tts_split_type
    #return(mm_skf_scoresD)
    #============================
    # Process the dict to have WF
    #============================
    if return_formatted_output: 
        CV_BT_metaDF = ProcessMultModelCl(mm_skf_scoresD)
        return(CV_BT_metaDF)
    else:
        return(mm_skf_scoresD)
--- a/scripts/ml/ml_data_7030.py
+++ b/scripts/ml/ml_data_7030.py
@ -37,6 +37,8 @@ def setvars(gene,drug):
    import argparse
    import re
    #%% GLOBALS
    tts_split = "70/30"
    rs = {'random_state': 42}
    njobs = {'n_jobs': 10}
@ -59,11 +61,9 @@ def setvars(gene,drug):
    mcc_score_fn  = {'mcc': make_scorer(matthews_corrcoef)}
    jacc_score_fn = {'jcc': make_scorer(jaccard_score)}   
    #%% FOR LATER: Combine ED logo data
    ###########################################################################
-    rs = {'random_state': 42}
+
    njobs = {'n_jobs': 10}
    homedir = os.path.expanduser("~")
    geneL_basic     = ['pnca']
@ -689,7 +689,8 @@ def setvars(gene,drug):
    print('\n-------------------------------------------------------------'
          , '\nSuccessfully split data: ALL features'
          , '\nactual values: training set'
-          , '\nimputed values: blind test set'
+          ,  '\nSplit:', tts_split
          #, '\nimputed values: blind test set'
          , '\n\nTotal data size:', len(X) + len(X_bts)
--- a/scripts/ml/run_7030.py
+++ b/scripts/ml/run_7030.py
@ -44,12 +44,6 @@ from ml_data_7030 import *
 # TT run all ML clfs: baseline model
 from MultModelsCl import MultModelsCl
 ############################################################################
 print('\n#####################################################################\n'
      , '\nRunning ML analysis: feature groups '
      , '\nGene name:', gene
      , '\nDrug name:', drug)
 #==================
 # Specify outdir 
 #==================
@ -101,7 +95,13 @@ scoreBT_mapD = {'bts_mcc'          : 'MCC'
 bts_size  = len(X_bts)
 yc2       = Counter(y_bts)
 yc2_ratio = yc2[0]/yc2[1]
 ###############################################################################
 print('\n#####################################################################\n'
      , '\nRunning ML analysis: feature groups '
      , '\nGene name:', gene
      , '\nDrug name:', drug)
 #%% Basic: No Oversampling
 #================
 # Baseline
--- a/scripts/ml/run_FS.py
+++ b/scripts/ml/run_FS.py
@ -5,45 +5,234 @@ Created on Tue May 24 08:11:05 2022
@author: tanu
 """
 #%%
 import os, sys
 import pandas as pd
 import numpy as np
 import pprint as pp
 from copy import deepcopy
 from sklearn import linear_model
 from sklearn import datasets
 from collections import Counter
 from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
 from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier
 from sklearn.naive_bayes import BernoulliNB
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.svm import SVC
 from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
 from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
 from sklearn.naive_bayes import GaussianNB
 from sklearn.gaussian_process import GaussianProcessClassifier, kernels
 from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
 from sklearn.neural_network import MLPClassifier
 from sklearn.svm import SVC
 from xgboost import XGBClassifier
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
 from sklearn.compose import ColumnTransformer
 from sklearn.compose import make_column_transformer
 from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
 from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
 # added
 from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
 from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
 from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
 from sklearn.pipeline import Pipeline, make_pipeline
 from sklearn.feature_selection import RFE, RFECV
 import itertools
 import seaborn as sns
 import matplotlib.pyplot as plt
 from statistics import mean, stdev, median, mode
 from imblearn.over_sampling import RandomOverSampler
 from imblearn.under_sampling import RandomUnderSampler
 from imblearn.over_sampling import SMOTE
 from sklearn.datasets import make_classification
 from imblearn.combine import SMOTEENN
 from imblearn.combine import SMOTETomek
 from imblearn.over_sampling import SMOTENC
 from imblearn.under_sampling import EditedNearestNeighbours
 from imblearn.under_sampling import RepeatedEditedNearestNeighbours
 from sklearn.model_selection import GridSearchCV
 from sklearn.base import BaseEstimator
 from sklearn.impute import KNNImputer as KNN
 import json
 import argparse
 import re
 ###############################################################################
 #gene  = 'pncA'
 #drug  = 'pyrazinamide'
 #total_mtblineage_uc = 8
 #%% command line args: case sensitive
 arg_parser = argparse.ArgumentParser()
 arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
 arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
 args = arg_parser.parse_args()
 drug    = args.drug
 gene    = args.gene
 ###############################################################################
 #==================
 # other vars
 #==================
 tts_split    = '70/30'
 OutFile_suffix  = '7030_FS'
 ###############################################################################
 #==================
 # Import data
 #==================
 from ml_data_7030 import *
 setvars(gene,drug)
 from ml_data_7030 import *
 # from YC run_all_ML: run locally
 #from UQ_yc_RunAllClfs import run_all_ML
 #==========================================
 # Import ML function: Feature selection
 #==========================================
 # TT run all ML clfs: feature selection
 from FS import fsgs
 #==================
 # Specify outdir 
 #==================
 outdir_ml = outdir + 'ml/tts_7030/fs/'
 print('\nOutput directory:', outdir_ml)
 OutFileFS = outdir_ml + gene.lower() + '_FS_' + OutFile_suffix + '.json'
 ############################################################################
 ###############################################################################
 #====================
 # single model CALL
 #====================
-a_fs0 = fsgs(input_df = X
+# aFS = fsgs(input_df = X
-         , target = y
+#          , target = y
-         , param_gridLd = [{'fs__min_features_to_select' : [1]}]
+#          , param_gridLd = [{'fs__min_features_to_select': [1]}]
-         , blind_test_df = X_bts
+#          , blind_test_df = X_bts
-         , blind_test_target = y_bts
+#          , blind_test_target = y_bts
-         , estimator = LogisticRegression(**rs)
+#          , estimator = LogisticRegression(**rs)
-         , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
+#          , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
-         , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv =  skf_cv, scoring = 'matthews_corrcoef')
+#          , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv =  skf_cv, scoring = 'matthews_corrcoef')
-         , cv_method =  skf_cv
+#          , cv_method =  skf_cv
-         , var_type = 'mixed'
+#          , var_type = 'mixed'
-         )
+#          )
-
+#############
-
+# Loop
-
+############
-
+# models_all = [
-
+#           ('XGBoost'                   , XGBClassifier(**rs, **njobs
-
+#                                                        , n_estimators = 100 # wasn't there
-
+#                                                        , max_depth = 3 # wasn't there
-
+#                                                        , verbosity = 3
-
+#                                                        #, use_label_encoder = False)
-
+#                                                        ) )
-
+# ]
 models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
          ##, ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
          , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
          , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
          , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
          , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
          #, ('Gaussian NB'               , GaussianNB() )
          #, ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
          #, ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
          , ('LDA'                       , LinearDiscriminantAnalysis() )
          , ('Logistic Regression'       , LogisticRegression(**rs) )
          , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
          #, ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
          #, ('Multinomial'               , MultinomialNB() )
          #, ('Naive Bayes'               , BernoulliNB() )
          , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
          #, ('QDA'                       , QuadraticDiscriminantAnalysis() )
          , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000 ) ) 
          , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
                                                                 , n_estimators     = 1000
                                                                 , bootstrap        = True
                                                                 , oob_score        = True
                                                                 , **njobs
                                                                 , **rs
                                                                 , max_features     = 'auto') ) 
          , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
          , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
          #, ('SVC'                       , SVC(**rs) ) 
          , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
          # , ('XGBoost'                   , XGBClassifier(**rs, **njobs, verbosity = 3
          #                                                , use_label_encoder = False) )
          ]
 print('\n#####################################################################'
      , '\nRunning Feature Selection using classfication models (n):', len(models)
      , '\nGene:'  , gene.lower()
      , '\nDrug:'  , drug
      , '\nSplit:' , tts_split
      ,'\n####################################################################')
 for m in models:
    print(m)
 print('\n====================================================================\n')
 out_fsD = {}
 index = 1
 for model_name, model_fn in models:
    print('\nRunning classifier with FS:', index
          , '\nModel_name:'               , model_name
          , '\nModel func:'               , model_fn)
          #, '\nList of models:', models)
    index = index+1
    out_fsD[model_name] = fsgs(input_df = X
              , target = y
              , param_gridLd = [{'fs__min_features_to_select': [1]}]
              , blind_test_df = X_bts
              , blind_test_target = y_bts
              , estimator = model_fn
              , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
              , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv =  skf_cv, scoring = 'matthews_corrcoef')
              , cv_method =  skf_cv
              , var_type = 'mixed'
              )
 out_fsD
 #%% Checking results dict    
 tot_Ditems = sum(len(v) for v in out_fsD.values())
 checkL = []
 for k, v in out_fsD.items():
    l = [len(out_fsD[k])]
    checkL = checkL + l
    n_sD = len(checkL) # no. of subDicts
    l_sD = list(set(checkL)) # length of each subDict
 print('\nTotal no.of subdicts:', n_sD)
 if len(l_sD) == 1 and tot_Ditems == n_sD*l_sD[0]:
    print('\nPASS: successful run for all Classifiers'
          , '\nLength of each subdict:', l_sD)
 print('\nSuccessfully ran Feature selection on', len(models), 'classifiers'
      , '\nGene:', gene.lower()
      , '\nDrug:', drug
      , '\nSplit type:', tts_split
      , '\nTotal fs models results:', len(out_fsD)
      , '\nTotal items in output:', sum(len(v) for v in out_fsD.values()) )
 ##############################################################################
@ -52,14 +241,15 @@ a_fs0 = fsgs(input_df = X
 # Write final output file
 # https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
 #========================================
-# #output final dict as a json
+# Output final dict as a json
-# outFile = 'LR_FS.json'
+print('\nWriting Final output file (json):', OutFileFS)
-# with open(outFile, 'w') as f:
+with open(OutFileFS, 'w') as f:
-#     f.write(json.dumps(output_modelD,cls=NpEncoder))
+    f.write(json.dumps(out_fsD
 #                       , cls = NpEncoder
 ))
 # # read json
-# file = 'LR_FS.json'
+# with open(OutFileFS, 'r') as f:
 # with open(file, 'r') as f:
 #     data = json.load(f)
 ##############################################################################
--- a/scripts/ml/running_ml_scripts.txt
+++ b/scripts/ml/running_ml_scripts.txt
@ -5,12 +5,12 @@
 # captures error: 2>&1
 # omitted drtype_labels
 =================================
-./run_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_7030.txt
+time ./run_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_7030.txt
-./run_7030.py -g embB -d ethambutol 2>&1 | tee log_embb_7030.txt
+time ./run_7030.py -g embB -d ethambutol 2>&1 | tee log_embb_7030.txt
-./run_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_7030.txt
+time ./run_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_7030.txt
-./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030.txt
+time ./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030.txt
-./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030.txt
+time ./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030.txt
-./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt
+time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt
 # alr: # ERROR, as expected, too few values!
 # gid: problems
 ########################################################################
@ -69,5 +69,13 @@
 # Date: 18/05/2022
 # captures error: 2>&1
 =================================
 ########################################################################
 ########################################################################
 ########################################################################
 # running feature selection
 # Split:70/30
 time ./run_FS.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030.txt
--- a/scripts/ml/scrMult_CALL.py
+++ b/scripts/ml/scrMult_CALL.py
@ -61,9 +61,6 @@ a_fs0 = fsgs(input_df = X
         , var_type = 'mixed'
         )
 ###############################################
 ##############################################################################
 # my function CALL
 #import fsgs from UQ_FS_fn