From 4fe62c072bc783fba2e479941e6a747fd411d700 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 23 Jun 2022 21:25:00 +0100 Subject: [PATCH] added metadata output for running multiple models --- scripts/ml/FS.py | 1 + scripts/ml/MultModelsCl.py | 106 ++++++++---- scripts/ml/ml_data_7030.py | 11 +- scripts/ml/run_7030.py | 12 +- scripts/ml/run_FS.py | 260 ++++++++++++++++++++++++++---- scripts/ml/running_ml_scripts.txt | 20 ++- scripts/ml/scrMult_CALL.py | 3 - 7 files changed, 325 insertions(+), 88 deletions(-) diff --git a/scripts/ml/FS.py b/scripts/ml/FS.py index 9d1aaef..9eba852 100755 --- a/scripts/ml/FS.py +++ b/scripts/ml/FS.py @@ -108,6 +108,7 @@ def fsgs(input_df , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv = skf_cv, scoring = 'matthews_corrcoef') , cv_method = skf_cv , var_type = ['numerical', 'categorical' , 'mixed'] + , verbose = 3 ): ''' returns diff --git a/scripts/ml/MultModelsCl.py b/scripts/ml/MultModelsCl.py index 95d8782..d50dc86 100755 --- a/scripts/ml/MultModelsCl.py +++ b/scripts/ml/MultModelsCl.py @@ -98,14 +98,25 @@ rskf_cv = RepeatedStratifiedKFold(n_splits = 10 mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} jacc_score_fn = {'jcc': make_scorer(jaccard_score)} + + +#FIXME +#==================== +# Import ProcessFunc +#==================== + +#from ProcessMultModelCl import * #%% # Multiple Classification - Model Pipeline def MultModelsCl(input_df, target, skf_cv , blind_test_df , blind_test_target + , tts_split_type + , resampling_type = 'none' # default , add_cm = True # adds confusion matrix based on cross_val_predict , add_yn = True # adds target var class numbers - , var_type = ['numerical', 'categorical','mixed']): + , var_type = ['numerical', 'categorical','mixed'] + , return_formatted_output = True): ''' @ param input_df: input features @@ -151,37 +162,37 @@ def MultModelsCl(input_df, target, skf_cv #====================================================== # Specify multiple Classification Models #====================================================== - models = [('Logistic Regression' , LogisticRegression(**rs) ) - , ('Logistic RegressionCV' , LogisticRegressionCV(**rs) ) - , ('Gaussian NB' , GaussianNB() ) - , ('Naive Bayes' , BernoulliNB() ) - , ('K-Nearest Neighbors' , KNeighborsClassifier() ) - , ('SVC' , SVC(**rs) ) - , ('MLP' , MLPClassifier(max_iter = 500, **rs) ) - , ('Decision Tree' , DecisionTreeClassifier(**rs) ) - , ('Extra Trees' , ExtraTreesClassifier(**rs) ) - , ('Extra Tree' , ExtraTreeClassifier(**rs) ) - , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) ) - , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 - , n_estimators = 1000 - , bootstrap = True - , oob_score = True - , **njobs - , **rs - , max_features = 'auto') ) - , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) ) - , ('LDA' , LinearDiscriminantAnalysis() ) - , ('Multinomial' , MultinomialNB() ) - , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) - , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) - , ('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) - , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) ) - , ('Gaussian Process' , GaussianProcessClassifier(**rs) ) - , ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) - , ('QDA' , QuadraticDiscriminantAnalysis() ) - , ('Ridge Classifier' , RidgeClassifier(**rs) ) - , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 10) ) - ] + models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) + # , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) ) + # , ('Decision Tree' , DecisionTreeClassifier(**rs) ) + # , ('Extra Tree' , ExtraTreeClassifier(**rs) ) + # , ('Extra Trees' , ExtraTreesClassifier(**rs) ) + # , ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) + # , ('Gaussian NB' , GaussianNB() ) + # , ('Gaussian Process' , GaussianProcessClassifier(**rs) ) + # , ('K-Nearest Neighbors' , KNeighborsClassifier() ) + # , ('LDA' , LinearDiscriminantAnalysis() ) + , ('Logistic Regression' , LogisticRegression(**rs) ) + # , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) + # , ('MLP' , MLPClassifier(max_iter = 500, **rs) ) + # , ('Multinomial' , MultinomialNB() ) + # , ('Naive Bayes' , BernoulliNB() ) + # , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) + # , ('QDA' , QuadraticDiscriminantAnalysis() ) + # , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) ) + # , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 + # , n_estimators = 1000 + # , bootstrap = True + # , oob_score = True + # , **njobs + # , **rs + # , max_features = 'auto') ) + # , ('Ridge Classifier' , RidgeClassifier(**rs) ) + # , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) + # , ('SVC' , SVC(**rs) ) + # , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) + # , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) ) + ] mm_skf_scoresD = {} @@ -314,5 +325,34 @@ def MultModelsCl(input_df, target, skf_cv mm_skf_scoresD[model_name]['bts_jcc'] = round(jaccard_score(blind_test_target, bts_predict),2) #mm_skf_scoresD[model_name]['diff_mcc'] = train_test_diff_MCC - return(mm_skf_scoresD) + #return(mm_skf_scoresD) +#%% + # ADD more info: meta data related to input and blind and resampling + # target numbers: training + yc1 = Counter(target) + yc1_ratio = yc1[0]/yc1[1] + + # target numbers: test + yc2 = Counter(blind_test_target) + yc2_ratio = yc2[0]/yc2[1] + + mm_skf_scoresD[model_name]['resampling'] = resampling_type + + mm_skf_scoresD[model_name]['training_size'] = len(input_df) + mm_skf_scoresD[model_name]['trainingY_ratio'] = round(yc1_ratio, 2) + + mm_skf_scoresD[model_name]['testSize'] = len(blind_test_df) + mm_skf_scoresD[model_name]['testY_ratio'] = round(yc2_ratio,2) + mm_skf_scoresD[model_name]['n_features'] = len(input_df.columns) + mm_skf_scoresD[model_name]['tts_split'] = tts_split_type + + #return(mm_skf_scoresD) + #============================ + # Process the dict to have WF + #============================ + if return_formatted_output: + CV_BT_metaDF = ProcessMultModelCl(mm_skf_scoresD) + return(CV_BT_metaDF) + else: + return(mm_skf_scoresD) \ No newline at end of file diff --git a/scripts/ml/ml_data_7030.py b/scripts/ml/ml_data_7030.py index 7abd35b..d7300f7 100644 --- a/scripts/ml/ml_data_7030.py +++ b/scripts/ml/ml_data_7030.py @@ -37,6 +37,8 @@ def setvars(gene,drug): import argparse import re #%% GLOBALS + tts_split = "70/30" + rs = {'random_state': 42} njobs = {'n_jobs': 10} @@ -58,12 +60,10 @@ def setvars(gene,drug): , **rs) mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} - jacc_score_fn = {'jcc': make_scorer(jaccard_score)} - + jacc_score_fn = {'jcc': make_scorer(jaccard_score)} #%% FOR LATER: Combine ED logo data ########################################################################### - rs = {'random_state': 42} - njobs = {'n_jobs': 10} + homedir = os.path.expanduser("~") geneL_basic = ['pnca'] @@ -689,7 +689,8 @@ def setvars(gene,drug): print('\n-------------------------------------------------------------' , '\nSuccessfully split data: ALL features' , '\nactual values: training set' - , '\nimputed values: blind test set' + , '\nSplit:', tts_split + #, '\nimputed values: blind test set' , '\n\nTotal data size:', len(X) + len(X_bts) diff --git a/scripts/ml/run_7030.py b/scripts/ml/run_7030.py index 1db56ed..902c5fe 100755 --- a/scripts/ml/run_7030.py +++ b/scripts/ml/run_7030.py @@ -44,12 +44,6 @@ from ml_data_7030 import * # TT run all ML clfs: baseline model from MultModelsCl import MultModelsCl -############################################################################ -print('\n#####################################################################\n' - , '\nRunning ML analysis: feature groups ' - , '\nGene name:', gene - , '\nDrug name:', drug) - #================== # Specify outdir #================== @@ -101,7 +95,13 @@ scoreBT_mapD = {'bts_mcc' : 'MCC' bts_size = len(X_bts) yc2 = Counter(y_bts) yc2_ratio = yc2[0]/yc2[1] + ############################################################################### +print('\n#####################################################################\n' + , '\nRunning ML analysis: feature groups ' + , '\nGene name:', gene + , '\nDrug name:', drug) + #%% Basic: No Oversampling #================ # Baseline diff --git a/scripts/ml/run_FS.py b/scripts/ml/run_FS.py index b4c8fb4..30eac6d 100755 --- a/scripts/ml/run_FS.py +++ b/scripts/ml/run_FS.py @@ -5,45 +5,234 @@ Created on Tue May 24 08:11:05 2022 @author: tanu """ +#%% +import os, sys +import pandas as pd +import numpy as np +import pprint as pp +from copy import deepcopy +from sklearn import linear_model +from sklearn import datasets +from collections import Counter + +from sklearn.linear_model import LogisticRegression, LogisticRegressionCV +from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier + +from sklearn.naive_bayes import BernoulliNB +from sklearn.neighbors import KNeighborsClassifier +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier +from sklearn.naive_bayes import GaussianNB +from sklearn.gaussian_process import GaussianProcessClassifier, kernels +from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel + +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis +from sklearn.neural_network import MLPClassifier + +from sklearn.svm import SVC +from xgboost import XGBClassifier +from sklearn.naive_bayes import MultinomialNB +from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder + +from sklearn.compose import ColumnTransformer +from sklearn.compose import make_column_transformer + +from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score +from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report + +# added +from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict + +from sklearn.model_selection import train_test_split, cross_validate, cross_val_score +from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold + +from sklearn.pipeline import Pipeline, make_pipeline + +from sklearn.feature_selection import RFE, RFECV + +import itertools +import seaborn as sns +import matplotlib.pyplot as plt + +from statistics import mean, stdev, median, mode + +from imblearn.over_sampling import RandomOverSampler +from imblearn.under_sampling import RandomUnderSampler +from imblearn.over_sampling import SMOTE +from sklearn.datasets import make_classification +from imblearn.combine import SMOTEENN +from imblearn.combine import SMOTETomek + +from imblearn.over_sampling import SMOTENC +from imblearn.under_sampling import EditedNearestNeighbours +from imblearn.under_sampling import RepeatedEditedNearestNeighbours + +from sklearn.model_selection import GridSearchCV +from sklearn.base import BaseEstimator +from sklearn.impute import KNNImputer as KNN +import json +import argparse +import re +############################################################################### +#gene = 'pncA' +#drug = 'pyrazinamide' +#total_mtblineage_uc = 8 + +#%% command line args: case sensitive +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '') +arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') +args = arg_parser.parse_args() + +drug = args.drug +gene = args.gene + +############################################################################### +#================== +# other vars +#================== +tts_split = '70/30' +OutFile_suffix = '7030_FS' +############################################################################### +#================== +# Import data +#================== +from ml_data_7030 import * +setvars(gene,drug) +from ml_data_7030 import * + +# from YC run_all_ML: run locally +#from UQ_yc_RunAllClfs import run_all_ML + +#========================================== +# Import ML function: Feature selection +#========================================== +# TT run all ML clfs: feature selection +from FS import fsgs + +#================== +# Specify outdir +#================== +outdir_ml = outdir + 'ml/tts_7030/fs/' +print('\nOutput directory:', outdir_ml) +OutFileFS = outdir_ml + gene.lower() + '_FS_' + OutFile_suffix + '.json' + +############################################################################ + ############################################################################### #==================== # single model CALL #==================== -a_fs0 = fsgs(input_df = X - , target = y - , param_gridLd = [{'fs__min_features_to_select' : [1]}] - , blind_test_df = X_bts - , blind_test_target = y_bts - , estimator = LogisticRegression(**rs) - , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below - , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv = skf_cv, scoring = 'matthews_corrcoef') - , cv_method = skf_cv - , var_type = 'mixed' - ) - - - - - - - - - - - - - - - - - - +# aFS = fsgs(input_df = X +# , target = y +# , param_gridLd = [{'fs__min_features_to_select': [1]}] +# , blind_test_df = X_bts +# , blind_test_target = y_bts +# , estimator = LogisticRegression(**rs) +# , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below +# , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv = skf_cv, scoring = 'matthews_corrcoef') +# , cv_method = skf_cv +# , var_type = 'mixed' +# ) +############# +# Loop +############ +# models_all = [ +# ('XGBoost' , XGBClassifier(**rs, **njobs +# , n_estimators = 100 # wasn't there +# , max_depyth = 3 # wasn't there +# , verbosity = 3 +# #, use_label_encoder = False) +# ) ) +# ] +models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) + ##, ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) ) + , ('Decision Tree' , DecisionTreeClassifier(**rs) ) + , ('Extra Tree' , ExtraTreeClassifier(**rs) ) + , ('Extra Trees' , ExtraTreesClassifier(**rs) ) + , ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) + #, ('Gaussian NB' , GaussianNB() ) + #, ('Gaussian Process' , GaussianProcessClassifier(**rs) ) + #, ('K-Nearest Neighbors' , KNeighborsClassifier() ) + , ('LDA' , LinearDiscriminantAnalysis() ) + , ('Logistic Regression' , LogisticRegression(**rs) ) + , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) + #, ('MLP' , MLPClassifier(max_iter = 500, **rs) ) + #, ('Multinomial' , MultinomialNB() ) + #, ('Naive Bayes' , BernoulliNB() ) + , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) + #, ('QDA' , QuadraticDiscriminantAnalysis() ) + , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) ) + , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 + , n_estimators = 1000 + , bootstrap = True + , oob_score = True + , **njobs + , **rs + , max_features = 'auto') ) + , ('Ridge Classifier' , RidgeClassifier(**rs) ) + , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) + #, ('SVC' , SVC(**rs) ) + , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) + # , ('XGBoost' , XGBClassifier(**rs, **njobs, verbosity = 3 + # , use_label_encoder = False) ) + ] +print('\n#####################################################################' + , '\nRunning Feature Selection using classfication models (n):', len(models) + , '\nGene:' , gene.lower() + , '\nDrug:' , drug + , '\nSplit:' , tts_split + ,'\n####################################################################') +for m in models: + print(m) +print('\n====================================================================\n') +out_fsD = {} +index = 1 +for model_name, model_fn in models: + print('\nRunning classifier with FS:', index + , '\nModel_name:' , model_name + , '\nModel func:' , model_fn) + #, '\nList of models:', models) + index = index+1 + + out_fsD[model_name] = fsgs(input_df = X + , target = y + , param_gridLd = [{'fs__min_features_to_select': [1]}] + , blind_test_df = X_bts + , blind_test_target = y_bts + , estimator = model_fn + , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below + , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv = skf_cv, scoring = 'matthews_corrcoef') + , cv_method = skf_cv + , var_type = 'mixed' + ) +out_fsD +#%% Checking results dict +tot_Ditems = sum(len(v) for v in out_fsD.values()) +checkL = [] +for k, v in out_fsD.items(): + l = [len(out_fsD[k])] + checkL = checkL + l + n_sD = len(checkL) # no. of subDicts + l_sD = list(set(checkL)) # length of each subDict + +print('\nTotal no.of subdicts:', n_sD) +if len(l_sD) == 1 and tot_Ditems == n_sD*l_sD[0]: + print('\nPASS: successful run for all Classifiers' + , '\nLength of each subdict:', l_sD) +print('\nSuccessfully ran Feature selection on', len(models), 'classifiers' + , '\nGene:', gene.lower() + , '\nDrug:', drug + , '\nSplit type:', tts_split + , '\nTotal fs models results:', len(out_fsD) + , '\nTotal items in output:', sum(len(v) for v in out_fsD.values()) ) ############################################################################## @@ -52,14 +241,15 @@ a_fs0 = fsgs(input_df = X # Write final output file # https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file #======================================== -# #output final dict as a json -# outFile = 'LR_FS.json' -# with open(outFile, 'w') as f: -# f.write(json.dumps(output_modelD,cls=NpEncoder)) +# Output final dict as a json +print('\nWriting Final output file (json):', OutFileFS) +with open(OutFileFS, 'w') as f: + f.write(json.dumps(out_fsD +# , cls = NpEncoder +)) # # read json -# file = 'LR_FS.json' -# with open(file, 'r') as f: +# with open(OutFileFS, 'r') as f: # data = json.load(f) ############################################################################## diff --git a/scripts/ml/running_ml_scripts.txt b/scripts/ml/running_ml_scripts.txt index 51e2158..e391698 100644 --- a/scripts/ml/running_ml_scripts.txt +++ b/scripts/ml/running_ml_scripts.txt @@ -5,12 +5,12 @@ # captures error: 2>$1 # omitted drtype_labels ================================= -./run_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_7030.txt -./run_7030.py -g embB -d ethambutol 2>&1 | tee log_embb_7030.txt -./run_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_7030.txt -./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030.txt -./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030.txt -./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt +time ./run_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_7030.txt +time ./run_7030.py -g embB -d ethambutol 2>&1 | tee log_embb_7030.txt +time ./run_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_7030.txt +time ./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030.txt +time ./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030.txt +time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt # alr: # ERROR, as expected, too few values! # gid: problems ######################################################################## @@ -69,5 +69,13 @@ # Date: 18/05/2022 # captures error: 2>$1 ================================= +######################################################################## +######################################################################## +######################################################################## + + +# running feature selection +# Split:70/30 +time ./run_FS.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030.txt diff --git a/scripts/ml/scrMult_CALL.py b/scripts/ml/scrMult_CALL.py index 208d534..b96c73d 100755 --- a/scripts/ml/scrMult_CALL.py +++ b/scripts/ml/scrMult_CALL.py @@ -61,9 +61,6 @@ a_fs0 = fsgs(input_df = X , var_type = 'mixed' ) ############################################### - - - ############################################################################## # my function CALL #import fsgs from UQ_FS_fn