added metadata output for running multiple models
This commit is contained in:
parent 5dea35f97c
commit 4fe62c072b
7 changed files with 325 additions and 88 deletions

@@ -5,45 +5,234 @@ Created on Tue May 24 08:11:05 2022
@author: tanu
"""
#%%
import os, sys
import pandas as pd
import numpy as np
import pprint as pp
from copy import deepcopy
from sklearn import linear_model
from sklearn import datasets
from collections import Counter

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier

from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier, kernels
from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report

# added
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.feature_selection import RFE, RFECV

import itertools
import seaborn as sns
import matplotlib.pyplot as plt

from statistics import mean, stdev, median, mode

from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
from imblearn.under_sampling import RandomUnderSampler
from sklearn.datasets import make_classification
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours

from sklearn.base import BaseEstimator
from sklearn.impute import KNNImputer as KNN
import json
import argparse
import re
###############################################################################
#gene = 'pncA'
#drug = 'pyrazinamide'
#total_mtblineage_uc = 8

#%% command line args: case sensitive
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
args = arg_parser.parse_args()

drug = args.drug
gene = args.gene
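
# Example invocation (script name assumed; gene/drug values must match the
# datasets that ml_data_7030 knows about, e.g. the pncA/pyrazinamide pair
# referenced above):
#   python run_FS_7030.py -d pyrazinamide -g pncA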

###############################################################################
#==================
# other vars
#==================
tts_split = '70/30'
OutFile_suffix = '7030_FS'
###############################################################################
#==================
# Import data
#==================
from ml_data_7030 import *
setvars(gene, drug)
from ml_data_7030 import * # re-import so the variables set by setvars() are in scope

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

#==========================================
# Import ML function: Feature selection
#==========================================
# TT run all ML clfs: feature selection
from FS import fsgs

#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_7030/fs/'
print('\nOutput directory:', outdir_ml)
OutFileFS = outdir_ml + gene.lower() + '_FS_' + OutFile_suffix + '.json'
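# e.g. for gene 'pncA' this resolves to <outdir>/ml/tts_7030/fs/pnca_FS_7030_FS.json
# (outdir itself is assumed to be provided by the ml_data_7030 star import)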

############################################################################

###############################################################################
#====================
# single model CALL
#====================
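# NOTE (assumption): rs, njobs, skf_cv and the X/y/X_bts/y_bts train and
# blind-test splits are expected to come from the ml_data_7030 star import,
# e.g. rs = {'random_state': 42}, njobs = {'n_jobs': -1},
# skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, **rs).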
a_fs0 = fsgs(input_df = X
             , target = y
             , param_gridLd = [{'fs__min_features_to_select' : [1]}]
             , blind_test_df = X_bts
             , blind_test_target = y_bts
             , estimator = LogisticRegression(**rs)
             , use_fs = False # False: estimator is also used as the RFECV estimator for fs. Set to True to supply custom_fs as shown below
             , custom_fs = RFECV(DecisionTreeClassifier(**rs), cv = skf_cv, scoring = 'matthews_corrcoef')
             , cv_method = skf_cv
             , var_type = 'mixed'
             )
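# a_fs0 should hold the FS results/metadata dict for this single estimator
# (the exact keys depend on fsgs() in FS.py)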

# aFS = fsgs(input_df = X
#            , target = y
#            , param_gridLd = [{'fs__min_features_to_select': [1]}]
#            , blind_test_df = X_bts
#            , blind_test_target = y_bts
#            , estimator = LogisticRegression(**rs)
#            , use_fs = False # False: estimator is also used as the RFECV estimator for fs. Set to True to supply custom_fs as shown below
#            , custom_fs = RFECV(DecisionTreeClassifier(**rs), cv = skf_cv, scoring = 'matthews_corrcoef')
#            , cv_method = skf_cv
#            , var_type = 'mixed'
#            )
#############
# Loop
############
# models_all = [
#     ('XGBoost', XGBClassifier(**rs, **njobs
#                               , n_estimators = 100 # wasn't there
#                               , max_depth = 3 # wasn't there
#                               , verbosity = 3
#                               #, use_label_encoder = False
#                               ) )
#     ]

models = [('AdaBoost Classifier'   , AdaBoostClassifier(**rs) )
          ##, ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
          , ('Decision Tree'       , DecisionTreeClassifier(**rs) )
          , ('Extra Tree'          , ExtraTreeClassifier(**rs) )
          , ('Extra Trees'         , ExtraTreesClassifier(**rs) )
          , ('Gradient Boosting'   , GradientBoostingClassifier(**rs) )
          #, ('Gaussian NB'         , GaussianNB() )
          #, ('Gaussian Process'    , GaussianProcessClassifier(**rs) )
          #, ('K-Nearest Neighbors' , KNeighborsClassifier() )
          , ('LDA'                 , LinearDiscriminantAnalysis() )
          , ('Logistic Regression' , LogisticRegression(**rs) )
          , ('Logistic RegressionCV', LogisticRegressionCV(cv = 3, **rs) )
          #, ('MLP'                 , MLPClassifier(max_iter = 500, **rs) )
          #, ('Multinomial'         , MultinomialNB() )
          #, ('Naive Bayes'         , BernoulliNB() )
          , ('Passive Aggressive'  , PassiveAggressiveClassifier(**rs, **njobs) )
          #, ('QDA'                 , QuadraticDiscriminantAnalysis() )
          , ('Random Forest'       , RandomForestClassifier(**rs, n_estimators = 1000) )
          , ('Random Forest2'      , RandomForestClassifier(min_samples_leaf = 5
                                                            , n_estimators = 1000
                                                            , bootstrap = True
                                                            , oob_score = True
                                                            , **njobs
                                                            , **rs
                                                            , max_features = 'auto') )
          , ('Ridge Classifier'    , RidgeClassifier(**rs) )
          , ('Ridge ClassifierCV'  , RidgeClassifierCV(cv = 3) )
          #, ('SVC'                 , SVC(**rs) )
          , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
          # , ('XGBoost'            , XGBClassifier(**rs, **njobs, verbosity = 3
          #                                         , use_label_encoder = False) )
          ]
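# NB: max_features = 'auto' (Random Forest2) was deprecated in scikit-learn 1.1
# and removed in 1.3; on newer versions use max_features = 'sqrt', which is
# what 'auto' meant for classifiers.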

print('\n#####################################################################'
      , '\nRunning Feature Selection using classification models (n):', len(models)
      , '\nGene:' , gene.lower()
      , '\nDrug:' , drug
      , '\nSplit:' , tts_split
      , '\n####################################################################')

for m in models:
    print(m)
print('\n====================================================================\n')

out_fsD = {}
index = 1
for model_name, model_fn in models:
    print('\nRunning classifier with FS:', index
          , '\nModel_name:' , model_name
          , '\nModel func:' , model_fn)
          #, '\nList of models:', models)
    index = index + 1

    out_fsD[model_name] = fsgs(input_df = X
                               , target = y
                               , param_gridLd = [{'fs__min_features_to_select': [1]}]
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts
                               , estimator = model_fn
                               , use_fs = False # False: estimator is also used as the RFECV estimator for fs. Set to True to supply custom_fs as shown below
                               , custom_fs = RFECV(DecisionTreeClassifier(**rs), cv = skf_cv, scoring = 'matthews_corrcoef')
                               , cv_method = skf_cv
                               , var_type = 'mixed'
                               )
out_fsD # inspect in interactive runs
#%% Checking results dict
tot_Ditems = sum(len(v) for v in out_fsD.values())

checkL = []
for k, v in out_fsD.items():
    l = [len(v)]
    checkL = checkL + l

n_sD = len(checkL)       # no. of subDicts
l_sD = list(set(checkL)) # length of each subDict

print('\nTotal no. of subdicts:', n_sD)
if len(l_sD) == 1 and tot_Ditems == n_sD * l_sD[0]:
    print('\nPASS: successful run for all Classifiers'
          , '\nLength of each subdict:', l_sD)

print('\nSuccessfully ran Feature selection on', len(models), 'classifiers'
      , '\nGene:', gene.lower()
      , '\nDrug:', drug
      , '\nSplit type:', tts_split
      , '\nTotal fs models results:', len(out_fsD)
      , '\nTotal items in output:', sum(len(v) for v in out_fsD.values()) )

##############################################################################

@@ -52,14 +241,15 @@ a_fs0 = fsgs(input_df = X
# Write final output file
# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
#========================================
# #output final dict as a json
# outFile = 'LR_FS.json'
# with open(outFile, 'w') as f:
#     f.write(json.dumps(output_modelD, cls = NpEncoder))
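
# json.dumps() cannot serialise numpy scalars/arrays, so writing out_fsD raw
# may raise a TypeError. A minimal sketch of the NpEncoder referenced here,
# assuming it follows the usual json.JSONEncoder pattern for numpy types:
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):   # e.g. np.int64 counts
            return int(obj)
        if isinstance(obj, np.floating):  # e.g. np.float64 scores
            return float(obj)
        if isinstance(obj, np.ndarray):   # e.g. coefficient arrays
            return obj.tolist()
        return super().default(obj)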

# Output final dict as a json
print('\nWriting Final output file (json):', OutFileFS)
with open(OutFileFS, 'w') as f:
    f.write(json.dumps(out_fsD
                       # , cls = NpEncoder
                       ))

# # read json
# file = 'LR_FS.json'
# with open(file, 'r') as f:
# with open(OutFileFS, 'r') as f:
#     data = json.load(f)
##############################################################################