added metadata output for running multiple models
This commit is contained in:
parent 5dea35f97c
commit 4fe62c072b
7 changed files with 325 additions and 88 deletions

@@ -5,45 +5,234 @@ Created on Tue May 24 08:11:05 2022
@author: tanu
"""
#%%
import os, sys
import pandas as pd
import numpy as np
import pprint as pp
from copy import deepcopy
from sklearn import linear_model
from sklearn import datasets
from collections import Counter

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier

from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier, kernels
from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report

# added
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.feature_selection import RFE, RFECV

import itertools
import seaborn as sns
import matplotlib.pyplot as plt

from statistics import mean, stdev, median, mode

from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
from imblearn.under_sampling import RandomUnderSampler
from sklearn.datasets import make_classification
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours

from sklearn.base import BaseEstimator
from sklearn.impute import KNNImputer as KNN
import json
import argparse
import re
###############################################################################
#gene = 'pncA'
#drug = 'pyrazinamide'
#total_mtblineage_uc = 8

#%% command line args: case sensitive
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
args = arg_parser.parse_args()

drug = args.drug
gene = args.gene
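
# Example invocation (script name assumed; gene/drug values must match the
# datasets that ml_data_7030 knows about, e.g. the pncA/pyrazinamide pair
# referenced above):
#   python run_FS_7030.py -d pyrazinamide -g pncA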

###############################################################################
#==================
# other vars
#==================
tts_split = '70/30'
OutFile_suffix = '7030_FS'
###############################################################################
#==================
# Import data
#==================
from ml_data_7030 import *
setvars(gene, drug)
from ml_data_7030 import * # re-import so the variables set by setvars() are in scope

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

#==========================================
# Import ML function: Feature selection
#==========================================
# TT run all ML clfs: feature selection
from FS import fsgs

#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_7030/fs/'
print('\nOutput directory:', outdir_ml)
OutFileFS = outdir_ml + gene.lower() + '_FS_' + OutFile_suffix + '.json'
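# e.g. for gene 'pncA' this resolves to <outdir>/ml/tts_7030/fs/pnca_FS_7030_FS.json
# (outdir itself is assumed to be provided by the ml_data_7030 star import)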

############################################################################

###############################################################################
#====================
# single model CALL
#====================
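# NOTE (assumption): rs, njobs, skf_cv and the X/y/X_bts/y_bts train and
# blind-test splits are expected to come from the ml_data_7030 star import,
# e.g. rs = {'random_state': 42}, njobs = {'n_jobs': -1},
# skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, **rs).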
a_fs0 = fsgs(input_df = X
             , target = y
             , param_gridLd = [{'fs__min_features_to_select' : [1]}]
             , blind_test_df = X_bts
             , blind_test_target = y_bts
             , estimator = LogisticRegression(**rs)
             , use_fs = False # False: estimator is also used as the RFECV estimator for fs. Set to True to supply custom_fs as shown below
             , custom_fs = RFECV(DecisionTreeClassifier(**rs), cv = skf_cv, scoring = 'matthews_corrcoef')
             , cv_method = skf_cv
             , var_type = 'mixed'
             )
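# a_fs0 should hold the FS results/metadata dict for this single estimator
# (the exact keys depend on fsgs() in FS.py)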

# aFS = fsgs(input_df = X
#            , target = y
#            , param_gridLd = [{'fs__min_features_to_select': [1]}]
#            , blind_test_df = X_bts
#            , blind_test_target = y_bts
#            , estimator = LogisticRegression(**rs)
#            , use_fs = False # False: estimator is also used as the RFECV estimator for fs. Set to True to supply custom_fs as shown below
#            , custom_fs = RFECV(DecisionTreeClassifier(**rs), cv = skf_cv, scoring = 'matthews_corrcoef')
#            , cv_method = skf_cv
#            , var_type = 'mixed'
#            )
#############
# Loop
############
# models_all = [
#     ('XGBoost', XGBClassifier(**rs, **njobs
#                               , n_estimators = 100 # wasn't there
#                               , max_depth = 3 # wasn't there
#                               , verbosity = 3
#                               #, use_label_encoder = False
#                               ) )
#     ]

models = [('AdaBoost Classifier'   , AdaBoostClassifier(**rs) )
          ##, ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
          , ('Decision Tree'       , DecisionTreeClassifier(**rs) )
          , ('Extra Tree'          , ExtraTreeClassifier(**rs) )
          , ('Extra Trees'         , ExtraTreesClassifier(**rs) )
          , ('Gradient Boosting'   , GradientBoostingClassifier(**rs) )
          #, ('Gaussian NB'         , GaussianNB() )
          #, ('Gaussian Process'    , GaussianProcessClassifier(**rs) )
          #, ('K-Nearest Neighbors' , KNeighborsClassifier() )
          , ('LDA'                 , LinearDiscriminantAnalysis() )
          , ('Logistic Regression' , LogisticRegression(**rs) )
          , ('Logistic RegressionCV', LogisticRegressionCV(cv = 3, **rs) )
          #, ('MLP'                 , MLPClassifier(max_iter = 500, **rs) )
          #, ('Multinomial'         , MultinomialNB() )
          #, ('Naive Bayes'         , BernoulliNB() )
          , ('Passive Aggressive'  , PassiveAggressiveClassifier(**rs, **njobs) )
          #, ('QDA'                 , QuadraticDiscriminantAnalysis() )
          , ('Random Forest'       , RandomForestClassifier(**rs, n_estimators = 1000) )
          , ('Random Forest2'      , RandomForestClassifier(min_samples_leaf = 5
                                                            , n_estimators = 1000
                                                            , bootstrap = True
                                                            , oob_score = True
                                                            , **njobs
                                                            , **rs
                                                            , max_features = 'auto') )
          , ('Ridge Classifier'    , RidgeClassifier(**rs) )
          , ('Ridge ClassifierCV'  , RidgeClassifierCV(cv = 3) )
          #, ('SVC'                 , SVC(**rs) )
          , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
          # , ('XGBoost'            , XGBClassifier(**rs, **njobs, verbosity = 3
          #                                         , use_label_encoder = False) )
          ]
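# NB: max_features = 'auto' (Random Forest2) was deprecated in scikit-learn 1.1
# and removed in 1.3; on newer versions use max_features = 'sqrt', which is
# what 'auto' meant for classifiers.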

print('\n#####################################################################'
      , '\nRunning Feature Selection using classification models (n):', len(models)
      , '\nGene:' , gene.lower()
      , '\nDrug:' , drug
      , '\nSplit:' , tts_split
      , '\n####################################################################')

for m in models:
    print(m)
print('\n====================================================================\n')

out_fsD = {}
index = 1
for model_name, model_fn in models:
    print('\nRunning classifier with FS:', index
          , '\nModel_name:' , model_name
          , '\nModel func:' , model_fn)
          #, '\nList of models:', models)
    index = index + 1

    out_fsD[model_name] = fsgs(input_df = X
                               , target = y
                               , param_gridLd = [{'fs__min_features_to_select': [1]}]
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts
                               , estimator = model_fn
                               , use_fs = False # False: estimator is also used as the RFECV estimator for fs. Set to True to supply custom_fs as shown below
                               , custom_fs = RFECV(DecisionTreeClassifier(**rs), cv = skf_cv, scoring = 'matthews_corrcoef')
                               , cv_method = skf_cv
                               , var_type = 'mixed'
                               )
out_fsD # inspect in interactive runs
#%% Checking results dict
tot_Ditems = sum(len(v) for v in out_fsD.values())

checkL = []
for k, v in out_fsD.items():
    l = [len(v)]
    checkL = checkL + l

n_sD = len(checkL)       # no. of subDicts
l_sD = list(set(checkL)) # length of each subDict

print('\nTotal no. of subdicts:', n_sD)
if len(l_sD) == 1 and tot_Ditems == n_sD * l_sD[0]:
    print('\nPASS: successful run for all Classifiers'
          , '\nLength of each subdict:', l_sD)

print('\nSuccessfully ran Feature selection on', len(models), 'classifiers'
      , '\nGene:', gene.lower()
      , '\nDrug:', drug
      , '\nSplit type:', tts_split
      , '\nTotal fs models results:', len(out_fsD)
      , '\nTotal items in output:', sum(len(v) for v in out_fsD.values()) )

##############################################################################

@@ -52,14 +241,15 @@ a_fs0 = fsgs(input_df = X
# Write final output file
# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
#========================================
# #output final dict as a json
# outFile = 'LR_FS.json'
# with open(outFile, 'w') as f:
#     f.write(json.dumps(output_modelD, cls = NpEncoder))
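
# json.dumps() cannot serialise numpy scalars/arrays, so writing out_fsD raw
# may raise a TypeError. A minimal sketch of the NpEncoder referenced here,
# assuming it follows the usual json.JSONEncoder pattern for numpy types:
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):   # e.g. np.int64 counts
            return int(obj)
        if isinstance(obj, np.floating):  # e.g. np.float64 scores
            return float(obj)
        if isinstance(obj, np.ndarray):   # e.g. coefficient arrays
            return obj.tolist()
        return super().default(obj)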

# Output final dict as a json
print('\nWriting Final output file (json):', OutFileFS)
with open(OutFileFS, 'w') as f:
    f.write(json.dumps(out_fsD
                       # , cls = NpEncoder
                       ))

# # read json
# file = 'LR_FS.json'
# with open(file, 'r') as f:
# with open(OutFileFS, 'r') as f:
#     data = json.load(f)
##############################################################################