# LSHTM_analysis/scripts/ml/run_FS.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 24 08:11:05 2022

@author: tanu
"""
#%%
import os, sys
import pandas as pd
import numpy as np
import pprint as pp
from copy import deepcopy
from sklearn import linear_model
from sklearn import datasets
from collections import Counter

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier

from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier, kernels
from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report

# added
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict

from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold

from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.feature_selection import RFE, RFECV

import itertools
import seaborn as sns
import matplotlib.pyplot as plt

from statistics import mean, stdev, median, mode

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours

from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from sklearn.impute import KNNImputer as KNN
import json
import argparse
import re
###############################################################################
#gene  = 'pncA'
#drug  = 'pyrazinamide'
#total_mtblineage_uc = 8

#%% command line args: case sensitive
# Both options are optional on the command line; each falls back to an empty
# string when it is not supplied.
cli_parser = argparse.ArgumentParser()
cli_parser.add_argument(
    '-d', '--drug',
    default = '',
    help = 'drug name',
)
cli_parser.add_argument(
    '-g', '--gene',
    default = '',
    help = 'gene name',
)
args = cli_parser.parse_args()

# Names consumed by the rest of the script.
drug, gene = args.drug, args.gene

###############################################################################
#==================
# other vars
#==================
# Label for the train/test split; only used in the progress messages printed
# further down in this script.
tts_split    = '70_30'
# Suffix appended to the output file name (see OutFileFS below).
OutFile_suffix  = '7030_FS'
###############################################################################
#==================
# Import data
#==================
# The first star-import brings setvars() into scope (nothing else in this
# script defines it, so it presumably lives in ml_data_7030).
# NOTE(review): setvars(gene, drug) presumably populates the module-level
# objects used later (X, y, X_bts, y_bts, rs, njobs, skf_cv, outdir) -- confirm
# in ml_data_7030. The second star-import re-binds the module's current
# globals into this namespace, so the order of these three lines matters.
from ml_data_7030 import *
setvars(gene,drug)
from ml_data_7030 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

#==========================================
# Import ML function: Feature selection
#==========================================
# TT run all ML clfs: feature selection
# fsgs is the function called once per classifier in the loop below.
from FS import fsgs

#==================
# Specify outdir
#==================
# outdir is not defined in this script (presumably supplied by the
# ml_data_7030 star-import). Paths are built by plain string concatenation,
# so outdir is expected to end with a path separator.
outdir_ml = outdir + 'ml/tts_7030/fs/'
print('\nOutput directory:', outdir_ml)
# Previous file name pattern, kept for reference:
#OutFileFS = outdir_ml + gene.lower() + '_FS' + OutFile_suffix + '.json'
# Resulting file name: <gene in lower case>_FS_noOR7030_FS.json
OutFileFS = outdir_ml + gene.lower() + '_FS_noOR' + OutFile_suffix + '.json'

############################################################################

###############################################################################
#====================
# single model CALL
#====================
# aFS = fsgs(input_df = X
#          , target = y
#          , param_gridLd = [{'fs__min_features_to_select': [1]}]
#          , blind_test_df = X_bts
#          , blind_test_target = y_bts
#          , estimator = LogisticRegression(**rs)
#          , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
#          , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv =  skf_cv, scoring = 'matthews_corrcoef')
#          , cv_method =  skf_cv
#          , var_type = 'mixed'
#          )
#############
# Loop
############
# models_all = [
#           ('XGBoost'                   , XGBClassifier(**rs, **njobs
#                                                        , n_estimators = 100 # wasn't there
#                                                        , max_depyth = 3 # wasn't there
#                                                        , verbosity = 3
#                                                        #, use_label_encoder = False)
#                                                        ) )
# ]

# Classifiers to run through feature selection, as (display name, estimator)
# pairs. The display name becomes the key of the results dict (and therefore
# of the JSON output), so the spellings below are kept exactly as they are.
# `rs` and `njobs` are unpacked as keyword arguments; they are not defined in
# this script (presumably random-state / n_jobs settings from ml_data_7030
# -- confirm there).
# Entries prefixed with '##' are deliberately disabled.
models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
          ##, ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
          , ('Decision Tree'             , DecisionTreeClassifier(**rs) )
          , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
          , ('Extra Trees'               , ExtraTreesClassifier(**rs) )
          , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
          ##, ('Gaussian NB'               , GaussianNB() )
          ##, ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
          ##, ('K-Nearest Neighbors'       , KNeighborsClassifier() )
          , ('LDA'                       , LinearDiscriminantAnalysis() )
          , ('Logistic Regression'       , LogisticRegression(**rs) )
          , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
          ##, ('MLP'                       , MLPClassifier(max_iter = 500, **rs) )
          ##, ('Multinomial'               , MultinomialNB() )
          ##, ('Naive Bayes'               , BernoulliNB() )
          , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
          ##, ('QDA'                       , QuadraticDiscriminantAnalysis() )
          , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000 ) )
          # max_features: 'auto' was deprecated in scikit-learn 1.1 and removed
          # in 1.3. For classifiers it meant sqrt(n_features), so 'sqrt' gives
          # the same model and is accepted by old and new releases alike.
          , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
                                                                 , n_estimators     = 1000
                                                                 , bootstrap        = True
                                                                 , oob_score        = True
                                                                 , **njobs
                                                                 , **rs
                                                                 , max_features     = 'sqrt') )
          , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
          , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )
          ##, ('SVC'                       , SVC(**rs) )
          , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
          ## , ('XGBoost'                   , XGBClassifier(**rs, **njobs, verbosity = 3
          ##                                                , use_label_encoder = False) )
          ]

# Run banner: summarise what is about to be executed. The pieces are printed
# space-separated, exactly as a multi-argument print() call would do.
banner_fields = (
    '\n#####################################################################'
    , '\nRunning Feature Selection using classfication models (n):', len(models)
    , '\nGene:'  , gene.lower()
    , '\nDrug:'  , drug
    , '\nSplit:' , tts_split
    , '\n####################################################################'
)
print(*banner_fields)

# Echo every (name, estimator) pair that will be evaluated, one per line.
for model_entry in models:
    print(model_entry)
print('\n====================================================================\n')

# Run feature selection once per classifier and collect the results in a dict
# keyed by the classifier's display name.
out_fsD = {}
for run_no, (model_name, model_fn) in enumerate(models, start = 1):
    print('\nRunning classifier with FS:', run_no
          , '\nModel_name:'               , model_name
          , '\nModel func:'               , model_fn)

    # Keyword arguments for this run. A fresh RFECV object is built on every
    # iteration, as in the original call.
    # Per the original author's note: with use_fs = False the estimator itself
    # is used as the RFECV parameter for fs; set it to True to have custom_fs
    # used instead.
    fs_kwargs = dict(
        input_df = X
        , target = y
        , param_gridLd = [{'fs__min_features_to_select': [1]}]
        , blind_test_df = X_bts
        , blind_test_target = y_bts
        , estimator = model_fn
        , use_fs = False
        , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv =  skf_cv, scoring = 'matthews_corrcoef')
        , cv_method =  skf_cv
        , var_type = 'mixed'
    )
    out_fsD[model_name] = fsgs(**fs_kwargs)

# Bare expression: shows the results dict when this cell is run interactively.
out_fsD
#%% Checking results dict
# Sanity check on the results: every classifier's sub-dict should hold the same
# number of entries. The summary values are computed once, up front, so they
# are always defined -- even when out_fsD is empty.
checkL = [len(sub_dict) for sub_dict in out_fsD.values()]  # entries per subDict
tot_Ditems = sum(checkL)   # total entries across all subDicts
n_sD = len(checkL)         # no. of subDicts
l_sD = list(set(checkL))   # distinct subDict lengths (one value when consistent)

print('\nTotal no.of subdicts:', n_sD)
if len(l_sD) == 1 and tot_Ditems == n_sD*l_sD[0]:
    print('\nPASS: successful run for all Classifiers'
          , '\nLength of each subdict:', l_sD)
else:
    # Previously a failed check printed nothing at all; report it explicitly
    # so an inconsistent (or empty) result set is not mistaken for a pass.
    print('\nFAIL: classifier outputs are empty or differ in length'
          , '\nDistinct subdict lengths:', l_sD)

print('\nSuccessfully ran Feature selection on', len(models), 'classifiers'
      , '\nGene:', gene.lower()
      , '\nDrug:', drug
      , '\nSplit type:', tts_split
      , '\nTotal fs models results:', len(out_fsD)
      , '\nTotal items in output:', tot_Ditems )


##############################################################################
#%% json output
#========================================
# Write final output file
# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
#========================================
# Output final dict as a json
# Serialise the results dict to JSON and write it to OutFileFS.
print('\nWriting Final output file (json):', OutFileFS)
with open(OutFileFS, 'w') as json_out:
    # Default encoder only; the custom NpEncoder (cls = NpEncoder) stays disabled.
    json_text = json.dumps(out_fsD)
    json_out.write(json_text)

# Read the file straight back; the parsed content is kept in `data`.
with open(OutFileFS, 'r') as json_in:
    data = json.load(json_in)
##############################################################################