added metadata output for running multiple models

This commit is contained in:
Tanushree Tunstall 2022-06-23 21:25:00 +01:00
parent 5dea35f97c
commit 4fe62c072b
7 changed files with 325 additions and 88 deletions

View file

@ -108,6 +108,7 @@ def fsgs(input_df
, custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv = skf_cv, scoring = 'matthews_corrcoef') , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv = skf_cv, scoring = 'matthews_corrcoef')
, cv_method = skf_cv , cv_method = skf_cv
, var_type = ['numerical', 'categorical' , 'mixed'] , var_type = ['numerical', 'categorical' , 'mixed']
, verbose = 3
): ):
''' '''
returns returns

View file

@ -98,14 +98,25 @@ rskf_cv = RepeatedStratifiedKFold(n_splits = 10
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)} jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
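#FIXME: MultModelsCl() calls ProcessMultModelCl() when return_formatted_output = True, but the import below is commented out; restore it (or import ProcessMultModelCl elsewhere) before relying on the formatted output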
#====================
# Import ProcessFunc
#====================
#from ProcessMultModelCl import *
#%% #%%
# Multiple Classification - Model Pipeline # Multiple Classification - Model Pipeline
def MultModelsCl(input_df, target, skf_cv def MultModelsCl(input_df, target, skf_cv
, blind_test_df , blind_test_df
, blind_test_target , blind_test_target
, tts_split_type
, resampling_type = 'none' # default
, add_cm = True # adds confusion matrix based on cross_val_predict , add_cm = True # adds confusion matrix based on cross_val_predict
, add_yn = True # adds target var class numbers , add_yn = True # adds target var class numbers
, var_type = ['numerical', 'categorical','mixed']): , var_type = ['numerical', 'categorical','mixed']
, return_formatted_output = True):
''' '''
@ param input_df: input features @ param input_df: input features
@ -151,37 +162,37 @@ def MultModelsCl(input_df, target, skf_cv
#====================================================== #======================================================
# Specify multiple Classification Models # Specify multiple Classification Models
#====================================================== #======================================================
models = [('Logistic Regression' , LogisticRegression(**rs) ) models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
, ('Logistic RegressionCV' , LogisticRegressionCV(**rs) ) # , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
, ('Gaussian NB' , GaussianNB() ) # , ('Decision Tree' , DecisionTreeClassifier(**rs) )
, ('Naive Bayes' , BernoulliNB() ) # , ('Extra Tree' , ExtraTreeClassifier(**rs) )
, ('K-Nearest Neighbors' , KNeighborsClassifier() ) # , ('Extra Trees' , ExtraTreesClassifier(**rs) )
, ('SVC' , SVC(**rs) ) # , ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
, ('MLP' , MLPClassifier(max_iter = 500, **rs) ) # , ('Gaussian NB' , GaussianNB() )
, ('Decision Tree' , DecisionTreeClassifier(**rs) ) # , ('Gaussian Process' , GaussianProcessClassifier(**rs) )
, ('Extra Trees' , ExtraTreesClassifier(**rs) ) # , ('K-Nearest Neighbors' , KNeighborsClassifier() )
, ('Extra Tree' , ExtraTreeClassifier(**rs) ) # , ('LDA' , LinearDiscriminantAnalysis() )
, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) ) , ('Logistic Regression' , LogisticRegression(**rs) )
, ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 # , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
, n_estimators = 1000 # , ('MLP' , MLPClassifier(max_iter = 500, **rs) )
, bootstrap = True # , ('Multinomial' , MultinomialNB() )
, oob_score = True # , ('Naive Bayes' , BernoulliNB() )
, **njobs # , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
, **rs # , ('QDA' , QuadraticDiscriminantAnalysis() )
, max_features = 'auto') ) # , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) )
, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) ) # , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
, ('LDA' , LinearDiscriminantAnalysis() ) # , n_estimators = 1000
, ('Multinomial' , MultinomialNB() ) # , bootstrap = True
, ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) # , oob_score = True
, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) # , **njobs
, ('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) # , **rs
, ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) ) # , max_features = 'auto') )
, ('Gaussian Process' , GaussianProcessClassifier(**rs) ) # , ('Ridge Classifier' , RidgeClassifier(**rs) )
, ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) # , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
, ('QDA' , QuadraticDiscriminantAnalysis() ) # , ('SVC' , SVC(**rs) )
, ('Ridge Classifier' , RidgeClassifier(**rs) ) # , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 10) ) # , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
] ]
mm_skf_scoresD = {} mm_skf_scoresD = {}
@ -314,5 +325,34 @@ def MultModelsCl(input_df, target, skf_cv
mm_skf_scoresD[model_name]['bts_jcc'] = round(jaccard_score(blind_test_target, bts_predict),2) mm_skf_scoresD[model_name]['bts_jcc'] = round(jaccard_score(blind_test_target, bts_predict),2)
#mm_skf_scoresD[model_name]['diff_mcc'] = train_test_diff_MCC #mm_skf_scoresD[model_name]['diff_mcc'] = train_test_diff_MCC
return(mm_skf_scoresD) #return(mm_skf_scoresD)
#%%
# ADD more info: meta data related to input and blind and resampling
# target numbers: training
yc1 = Counter(target)
yc1_ratio = yc1[0]/yc1[1]
# target numbers: test
yc2 = Counter(blind_test_target)
yc2_ratio = yc2[0]/yc2[1]
mm_skf_scoresD[model_name]['resampling'] = resampling_type
mm_skf_scoresD[model_name]['training_size'] = len(input_df)
mm_skf_scoresD[model_name]['trainingY_ratio'] = round(yc1_ratio, 2)
mm_skf_scoresD[model_name]['testSize'] = len(blind_test_df)
mm_skf_scoresD[model_name]['testY_ratio'] = round(yc2_ratio,2)
mm_skf_scoresD[model_name]['n_features'] = len(input_df.columns)
mm_skf_scoresD[model_name]['tts_split'] = tts_split_type
#return(mm_skf_scoresD)
#============================
# Process the dict to have WF
#============================
if return_formatted_output:
CV_BT_metaDF = ProcessMultModelCl(mm_skf_scoresD)
return(CV_BT_metaDF)
else:
return(mm_skf_scoresD)

View file

@ -37,6 +37,8 @@ def setvars(gene,drug):
import argparse import argparse
import re import re
#%% GLOBALS #%% GLOBALS
tts_split = "70/30"
rs = {'random_state': 42} rs = {'random_state': 42}
njobs = {'n_jobs': 10} njobs = {'n_jobs': 10}
@ -58,12 +60,10 @@ def setvars(gene,drug):
, **rs) , **rs)
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)} jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
#%% FOR LATER: Combine ED logo data #%% FOR LATER: Combine ED logo data
########################################################################### ###########################################################################
rs = {'random_state': 42}
njobs = {'n_jobs': 10}
homedir = os.path.expanduser("~") homedir = os.path.expanduser("~")
geneL_basic = ['pnca'] geneL_basic = ['pnca']
@ -689,7 +689,8 @@ def setvars(gene,drug):
print('\n-------------------------------------------------------------' print('\n-------------------------------------------------------------'
, '\nSuccessfully split data: ALL features' , '\nSuccessfully split data: ALL features'
, '\nactual values: training set' , '\nactual values: training set'
, '\nimputed values: blind test set' , '\nSplit:', tts_split
#, '\nimputed values: blind test set'
, '\n\nTotal data size:', len(X) + len(X_bts) , '\n\nTotal data size:', len(X) + len(X_bts)

View file

@ -44,12 +44,6 @@ from ml_data_7030 import *
# TT run all ML clfs: baseline model # TT run all ML clfs: baseline model
from MultModelsCl import MultModelsCl from MultModelsCl import MultModelsCl
############################################################################
print('\n#####################################################################\n'
, '\nRunning ML analysis: feature groups '
, '\nGene name:', gene
, '\nDrug name:', drug)
#================== #==================
# Specify outdir # Specify outdir
#================== #==================
@ -101,7 +95,13 @@ scoreBT_mapD = {'bts_mcc' : 'MCC'
bts_size = len(X_bts) bts_size = len(X_bts)
yc2 = Counter(y_bts) yc2 = Counter(y_bts)
yc2_ratio = yc2[0]/yc2[1] yc2_ratio = yc2[0]/yc2[1]
############################################################################### ###############################################################################
print('\n#####################################################################\n'
, '\nRunning ML analysis: feature groups '
, '\nGene name:', gene
, '\nDrug name:', drug)
#%% Basic: No Oversampling #%% Basic: No Oversampling
#================ #================
# Baseline # Baseline

View file

@ -5,45 +5,234 @@ Created on Tue May 24 08:11:05 2022
@author: tanu @author: tanu
""" """
#%%
import os, sys
import pandas as pd
import numpy as np
import pprint as pp
from copy import deepcopy
from sklearn import linear_model
from sklearn import datasets
from collections import Counter
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier, kernels
from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
# added
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import RFE, RFECV
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
from statistics import mean, stdev, median, mode
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from sklearn.impute import KNNImputer as KNN
import json
import argparse
import re
###############################################################################
#gene = 'pncA'
#drug = 'pyrazinamide'
#total_mtblineage_uc = 8
#%% command line args: case sensitive
# Command-line interface: both options may be omitted, in which case the
# corresponding variable is an empty string. Values are used as given
# (case sensitive).
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', default = '', help = 'drug name')
arg_parser.add_argument('-g', '--gene', default = '', help = 'gene name')
args = arg_parser.parse_args()
drug, gene = args.drug, args.gene
###############################################################################
#==================
# other vars
#==================
tts_split = '70/30'
OutFile_suffix = '7030_FS'
###############################################################################
#==================
# Import data
#==================
# Import order matters here: the first star-import makes setvars() available,
# setvars() is then called for the requested gene/drug, and the module is
# star-imported a second time.
# NOTE(review): the second import presumably pulls in names that setvars()
# publishes as ml_data_7030 globals (X, y, X_bts, y_bts, outdir, skf_cv, rs,
# njobs are used below but never assigned in this script) -- confirm.
from ml_data_7030 import *
setvars(gene,drug)
from ml_data_7030 import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
#==========================================
# Import ML function: Feature selection
#==========================================
# TT run all ML clfs: feature selection
from FS import fsgs
#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_7030/fs/'
print('\nOutput directory:', outdir_ml)
OutFileFS = outdir_ml + gene.lower() + '_FS_' + OutFile_suffix + '.json'
############################################################################
############################################################################### ###############################################################################
#==================== #====================
# single model CALL # single model CALL
#==================== #====================
a_fs0 = fsgs(input_df = X # aFS = fsgs(input_df = X
, target = y # , target = y
, param_gridLd = [{'fs__min_features_to_select' : [1]}] # , param_gridLd = [{'fs__min_features_to_select': [1]}]
, blind_test_df = X_bts # , blind_test_df = X_bts
, blind_test_target = y_bts # , blind_test_target = y_bts
, estimator = LogisticRegression(**rs) # , estimator = LogisticRegression(**rs)
, use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below # , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
, custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv = skf_cv, scoring = 'matthews_corrcoef') # , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv = skf_cv, scoring = 'matthews_corrcoef')
, cv_method = skf_cv # , cv_method = skf_cv
, var_type = 'mixed' # , var_type = 'mixed'
) # )
#############
# Loop
############
# models_all = [
# ('XGBoost' , XGBClassifier(**rs, **njobs
# , n_estimators = 100 # wasn't there
# , max_depth = 3 # wasn't there
# , verbosity = 3
# #, use_label_encoder = False)
# ) )
# ]
# Classifiers to run feature selection with, as (display name, estimator)
# pairs. The display name becomes the key of that classifier's entry in the
# results dict. Commented-out entries are excluded from this run.
models = [('AdaBoost Classifier'    , AdaBoostClassifier(**rs) )
          ##, ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
          , ('Decision Tree'        , DecisionTreeClassifier(**rs) )
          , ('Extra Tree'           , ExtraTreeClassifier(**rs) )
          , ('Extra Trees'          , ExtraTreesClassifier(**rs) )
          , ('Gradient Boosting'    , GradientBoostingClassifier(**rs) )
          #, ('Gaussian NB'         , GaussianNB() )
          #, ('Gaussian Process'    , GaussianProcessClassifier(**rs) )
          #, ('K-Nearest Neighbors' , KNeighborsClassifier() )
          , ('LDA'                  , LinearDiscriminantAnalysis() )
          , ('Logistic Regression'  , LogisticRegression(**rs) )
          , ('Logistic RegressionCV', LogisticRegressionCV(cv = 3, **rs))
          #, ('MLP'                 , MLPClassifier(max_iter = 500, **rs) )
          #, ('Multinomial'         , MultinomialNB() )
          #, ('Naive Bayes'         , BernoulliNB() )
          , ('Passive Aggresive'    , PassiveAggressiveClassifier(**rs, **njobs) )
          #, ('QDA'                 , QuadraticDiscriminantAnalysis() )
          , ('Random Forest'        , RandomForestClassifier(**rs, n_estimators = 1000 ) )
          , ('Random Forest2'       , RandomForestClassifier(min_samples_leaf = 5
                                                             , n_estimators = 1000
                                                             , bootstrap = True
                                                             , oob_score = True
                                                             , **njobs
                                                             , **rs
                                                             # 'sqrt' is what 'auto' meant for classifiers;
                                                             # 'auto' was deprecated in scikit-learn 1.1 and
                                                             # is rejected from 1.3 onwards
                                                             , max_features = 'sqrt') )
          , ('Ridge Classifier'     , RidgeClassifier(**rs) )
          , ('Ridge ClassifierCV'   , RidgeClassifierCV(cv = 3) )
          #, ('SVC'                 , SVC(**rs) )
          , ('Stochastic GDescent'  , SGDClassifier(**rs, **njobs) )
          # , ('XGBoost'            , XGBClassifier(**rs, **njobs, verbosity = 3
          #                                         , use_label_encoder = False) )
          ]
print('\n#####################################################################'
, '\nRunning Feature Selection using classfication models (n):', len(models)
, '\nGene:' , gene.lower()
, '\nDrug:' , drug
, '\nSplit:' , tts_split
,'\n####################################################################')
for m in models:
print(m)
print('\n====================================================================\n')
# Run fsgs() once per classifier in `models` and store whatever it returns
# under the classifier's display name.
out_fsD = {}

# enumerate(start = 1) supplies the 1-based run counter shown in the progress
# message, replacing a hand-maintained counter variable.
for index, (model_name, model_fn) in enumerate(models, start = 1):
    print('\nRunning classifier with FS:', index
          , '\nModel_name:' , model_name
          , '\nModel func:' , model_fn)

    out_fsD[model_name] = fsgs(input_df = X
                               , target = y
                               , param_gridLd = [{'fs__min_features_to_select': [1]}]
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts
                               , estimator = model_fn
                               , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
                               # a fresh RFECV is built for every classifier so no selector object is shared between runs
                               , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv = skf_cv, scoring = 'matthews_corrcoef')
                               , cv_method = skf_cv
                               , var_type = 'mixed'
                               )
#%% Checking results dict
# Sanity check on the results dict: every classifier's sub-dict should hold
# the same number of items.
checkL = [len(sub_dict) for sub_dict in out_fsD.values()]  # items per subDict
tot_Ditems = sum(checkL)                                    # items over all subDicts
n_sD = len(checkL)                                          # no. of subDicts
l_sD = list(set(checkL))                                    # distinct subDict lengths

print('\nTotal no.of subdicts:', n_sD)

if len(l_sD) == 1 and tot_Ditems == n_sD*l_sD[0]:
    print('\nPASS: successful run for all Classifiers'
          , '\nLength of each subdict:', l_sD)
else:
    # Report the mismatch (or an empty result) instead of staying silent;
    # the script still carries on so that partial results are not lost.
    print('\nFAIL: results are missing or differ in length between Classifiers'
          , '\nLengths of subdicts found:', l_sD)
print('\nSuccessfully ran Feature selection on', len(models), 'classifiers'
, '\nGene:', gene.lower()
, '\nDrug:', drug
, '\nSplit type:', tts_split
, '\nTotal fs models results:', len(out_fsD)
, '\nTotal items in output:', sum(len(v) for v in out_fsD.values()) )
############################################################################## ##############################################################################
@ -52,14 +241,15 @@ a_fs0 = fsgs(input_df = X
# Write final output file # Write final output file
# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file # https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
#======================================== #========================================
# #output final dict as a json # Output final dict as a json
# outFile = 'LR_FS.json' print('\nWriting Final output file (json):', OutFileFS)
# with open(outFile, 'w') as f: with open(OutFileFS, 'w') as f:
# f.write(json.dumps(output_modelD,cls=NpEncoder)) f.write(json.dumps(out_fsD
# , cls = NpEncoder
))
# # read json # # read json
# file = 'LR_FS.json' # with open(OutFileFS, 'r') as f:
# with open(file, 'r') as f:
# data = json.load(f) # data = json.load(f)
############################################################################## ##############################################################################

View file

@ -5,12 +5,12 @@
# captures error: 2>&1 # captures error: 2>&1
# omitted drtype_labels # omitted drtype_labels
================================= =================================
./run_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_7030.txt time ./run_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_7030.txt
./run_7030.py -g embB -d ethambutol 2>&1 | tee log_embb_7030.txt time ./run_7030.py -g embB -d ethambutol 2>&1 | tee log_embb_7030.txt
./run_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_7030.txt time ./run_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_7030.txt
./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030.txt time ./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030.txt
./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030.txt time ./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030.txt
./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt
# alr: # ERROR, as expected, too few values! # alr: # ERROR, as expected, too few values!
# gid: problems # gid: problems
######################################################################## ########################################################################
@ -69,5 +69,13 @@
# Date: 18/05/2022 # Date: 18/05/2022
# captures error: 2>&1 # captures error: 2>&1
================================= =================================
########################################################################
########################################################################
########################################################################
# running feature selection
# Split:70/30
time ./run_FS.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030.txt

View file

@ -61,9 +61,6 @@ a_fs0 = fsgs(input_df = X
, var_type = 'mixed' , var_type = 'mixed'
) )
############################################### ###############################################
############################################################################## ##############################################################################
# my function CALL # my function CALL
#import fsgs from UQ_FS_fn #import fsgs from UQ_FS_fn