#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 24 08:11:05 2022

@author: tanu

Run feature selection (FS.fsgs) once per classifier in ``models`` on the
70/30 train/test split data and write the collected results to a JSON file.

Command line arguments (case sensitive):
    -d / --drug : drug name
    -g / --gene : gene name
"""
#%%
import os, sys
import pandas as pd
import numpy as np
import pprint as pp
from copy import deepcopy
from sklearn import linear_model
from sklearn import datasets
from collections import Counter

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier

from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier, kernels
from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report

# added
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict

from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold

from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.feature_selection import RFE, RFECV

import itertools
import seaborn as sns
import matplotlib.pyplot as plt

from statistics import mean, stdev, median, mode

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours

from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from sklearn.impute import KNNImputer as KNN
import json
import argparse
import re
###############################################################################
#gene = 'pncA'
#drug = 'pyrazinamide'
#total_mtblineage_uc = 8

#%% command line args: case sensitive
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
args = arg_parser.parse_args()

drug = args.drug
gene = args.gene

###############################################################################
#==================
# other vars
#==================
tts_split = '70/30'
OutFile_suffix = '7030_FS'
###############################################################################
#==================
# Import data
#==================
# The first star-import brings setvars() into scope; the second one re-binds
# the module's names after setvars(gene, drug) has run.
# NOTE(review): presumably setvars() populates the module-level data used
# below (X, y, X_bts, y_bts, rs, njobs, skf_cv, outdir) -- confirm against
# ml_data_7030.
from ml_data_7030 import *
setvars(gene,drug)
from ml_data_7030 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

#==========================================
# Import ML function: Feature selection
#==========================================
# TT run all ML clfs: feature selection
from FS import fsgs

#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_7030/fs/'
print('\nOutput directory:', outdir_ml)
OutFileFS = outdir_ml + gene.lower() + '_FS_' + OutFile_suffix + '.json'

# Create the output directory up front: otherwise a missing directory is only
# discovered when the JSON is written, i.e. after every classifier has run.
os.makedirs(outdir_ml, exist_ok = True)

############################################################################
###############################################################################
#====================
# single model CALL
#====================
# aFS = fsgs(input_df                = X
#               , target             = y
#               , param_gridLd       = [{'fs__min_features_to_select': [1]}]
#               , blind_test_df      = X_bts
#               , blind_test_target  = y_bts
#               , estimator          = LogisticRegression(**rs)
#               , use_fs             = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
#               , custom_fs          = RFECV(DecisionTreeClassifier(**rs) , cv = skf_cv, scoring = 'matthews_corrcoef')
#               , cv_method          = skf_cv
#               , var_type           = 'mixed'
#               )

#############
# Loop
############
# models_all = [
#     ('XGBoost' , XGBClassifier(**rs, **njobs
#                                , n_estimators = 100 # wasn't there
#                                , max_depth = 3 # wasn't there
#                                , verbosity = 3
#                                #, use_label_encoder = False)
#                                ) )
#     ]

# (display name, estimator instance) pairs; the display name becomes the key
# of the corresponding entry in the output dict / JSON.
models = [('AdaBoost Classifier'        , AdaBoostClassifier(**rs) )
          ##, ('Bagging Classifier'     , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
          , ('Decision Tree'            , DecisionTreeClassifier(**rs) )
          , ('Extra Tree'               , ExtraTreeClassifier(**rs) )
          , ('Extra Trees'              , ExtraTreesClassifier(**rs) )
          , ('Gradient Boosting'        , GradientBoostingClassifier(**rs) )
          #, ('Gaussian NB'             , GaussianNB() )
          #, ('Gaussian Process'        , GaussianProcessClassifier(**rs) )
          #, ('K-Nearest Neighbors'     , KNeighborsClassifier() )
          , ('LDA'                      , LinearDiscriminantAnalysis() )
          , ('Logistic Regression'      , LogisticRegression(**rs) )
          , ('Logistic RegressionCV'    , LogisticRegressionCV(cv = 3, **rs))
          #, ('MLP'                     , MLPClassifier(max_iter = 500, **rs) )
          #, ('Multinomial'             , MultinomialNB() )
          #, ('Naive Bayes'             , BernoulliNB() )
          , ('Passive Aggresive'        , PassiveAggressiveClassifier(**rs, **njobs) )
          #, ('QDA'                     , QuadraticDiscriminantAnalysis() )
          , ('Random Forest'            , RandomForestClassifier(**rs, n_estimators = 1000 ) )
          # max_features: 'sqrt' is what 'auto' meant for classifiers; 'auto'
          # itself was deprecated in scikit-learn 1.1 and removed in 1.3.
          , ('Random Forest2'           , RandomForestClassifier(min_samples_leaf = 5
                                                                 , n_estimators = 1000
                                                                 , bootstrap = True
                                                                 , oob_score = True
                                                                 , **njobs
                                                                 , **rs
                                                                 , max_features = 'sqrt') )
          , ('Ridge Classifier'         , RidgeClassifier(**rs) )
          , ('Ridge ClassifierCV'       , RidgeClassifierCV(cv = 3) )
          #, ('SVC'                     , SVC(**rs) )
          , ('Stochastic GDescent'      , SGDClassifier(**rs, **njobs) )
          # , ('XGBoost'                , XGBClassifier(**rs, **njobs, verbosity = 3
          #                                              , use_label_encoder = False) )
          ]

print('\n#####################################################################'
      , '\nRunning Feature Selection using classfication models (n):', len(models)
      , '\nGene:' , gene.lower()
      , '\nDrug:' , drug
      , '\nSplit:' , tts_split
      ,'\n####################################################################')

for m in models:
    print(m)
print('\n====================================================================\n')

# Results keyed by model display name; one fsgs() result per classifier.
out_fsD = {}
for index, (model_name, model_fn) in enumerate(models, start = 1):
    print('\nRunning classifier with FS:', index
          , '\nModel_name:'     , model_name
          , '\nModel func:'     , model_fn)
          #, '\nList of models:', models)
    out_fsD[model_name] = fsgs(input_df            = X
                               , target            = y
                               , param_gridLd      = [{'fs__min_features_to_select': [1]}]
                               , blind_test_df     = X_bts
                               , blind_test_target = y_bts
                               , estimator         = model_fn
                               , use_fs            = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
                               , custom_fs         = RFECV(DecisionTreeClassifier(**rs) , cv = skf_cv, scoring = 'matthews_corrcoef')
                               , cv_method         = skf_cv
                               , var_type          = 'mixed'
                               )

#%% Checking results dict
# Sanity check: every classifier should have produced a sub-dict with the same
# number of items. Computed outside any loop so the names exist even when no
# model was run.
checkL     = [len(v) for v in out_fsD.values()]
tot_Ditems = sum(checkL)
n_sD       = len(checkL)        # no. of subDicts
l_sD       = list(set(checkL))  # distinct subDict lengths (one value expected)

print('\nTotal no.of subdicts:', n_sD)
if len(l_sD) == 1 and tot_Ditems == n_sD*l_sD[0]:
    print('\nPASS: successful run for all Classifiers'
          , '\nLength of each subdict:', l_sD)
else:
    # Previously a failed check printed nothing at all; make it visible.
    print('\nFAIL: classifier results differ in length or are missing'
          , '\nLengths of subdicts:', checkL)

print('\nSuccessfully ran Feature selection on', len(models), 'classifiers'
      , '\nGene:', gene.lower()
      , '\nDrug:', drug
      , '\nSplit type:', tts_split
      , '\nTotal fs models results:', len(out_fsD)
      , '\nTotal items in output:', tot_Ditems
      )

##############################################################################
#%% json output
#========================================
# Write final output file
# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
#========================================
# Output final dict as a json
print('\nWriting Final output file (json):', OutFileFS)
# Serialise before opening the file so that a serialisation error cannot
# truncate an existing output file.
out_json = json.dumps(out_fsD
                      # , cls = NpEncoder
                      )
with open(OutFileFS, 'w') as f:
    f.write(out_json)

# # read json
# with open(OutFileFS, 'r') as f:
#     data = json.load(f)
##############################################################################