From 652cf4802ea38edcdd85664fa803260e224e825e Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 5 Jul 2022 14:19:35 +0100
Subject: [PATCH] added MultClfs_fi to add FI scores for models, in development

---
 scripts/ml/ml_functions/MultClfs_fi.py | 323 +++++++++++++++++++++++++
 1 file changed, 323 insertions(+)
 create mode 100644 scripts/ml/ml_functions/MultClfs_fi.py

diff --git a/scripts/ml/ml_functions/MultClfs_fi.py b/scripts/ml/ml_functions/MultClfs_fi.py
new file mode 100644
index 0000000..3803bd7
--- /dev/null
+++ b/scripts/ml/ml_functions/MultClfs_fi.py
@@ -0,0 +1,323 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Mar  4 15:25:33 2022
+
+@author: tanu
+"""
+#%%
+import os, sys
+import pandas as pd
+import numpy as np
+import pprint as pp
+from copy import deepcopy
+from sklearn import linear_model
+from sklearn import datasets
+from collections import Counter
+
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
+from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier
+
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.gaussian_process import GaussianProcessClassifier, kernels
+from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel
+
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
+from sklearn.neural_network import MLPClassifier
+
+from sklearn.svm import SVC
+from xgboost import XGBClassifier
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
+
+from sklearn.compose import ColumnTransformer
+from sklearn.compose import make_column_transformer
+
+from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
+
+# added
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
+
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
+from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
+
+from sklearn.pipeline import Pipeline, make_pipeline
+
+from sklearn.feature_selection import RFE, RFECV
+
+import itertools
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+from statistics import mean, stdev, median, mode
+
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.under_sampling import RandomUnderSampler
+from imblearn.over_sampling import SMOTE
+from sklearn.datasets import make_classification
+from imblearn.combine import SMOTEENN
+from imblearn.combine import SMOTETomek
+
+from imblearn.over_sampling import SMOTENC
+from imblearn.under_sampling import EditedNearestNeighbours
+from imblearn.under_sampling import RepeatedEditedNearestNeighbours
+
+from sklearn.model_selection import GridSearchCV
+from sklearn.base import BaseEstimator
+from sklearn.impute import KNNImputer as KNN
+import json
+import argparse
+import re
+#%% GLOBALS
+rs = {'random_state': 42} # shared seed; unpacked (**rs) into the CV splitters and the classifiers
+njobs = {'n_jobs': os.cpu_count() } # n_jobs = CPU count reported by the OS (os.cpu_count() may return None)
+# Scorers handed to cross_validate(); each one is reported back under the key 'test_<name>'
+scoring_fn =  ({ 'mcc'        : make_scorer(matthews_corrcoef)
+                , 'fscore'    : make_scorer(f1_score)
+                , 'precision' : make_scorer(precision_score)
+                , 'recall'    : make_scorer(recall_score)
+                , 'accuracy'  : make_scorer(accuracy_score)
+                , 'roc_auc'   : make_scorer(roc_auc_score)
+                , 'jcc'       : make_scorer(jaccard_score)
+            }) 
+# Stratified 10-fold CV with shuffling, seeded via rs
+skf_cv = StratifiedKFold(n_splits = 10
+                          #, shuffle = False, random_state= None)
+                           , shuffle = True,**rs)
+# Repeated variant: 10 stratified folds x 3 repeats, seeded via rs
+rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+                                  , n_repeats = 3
+                                  , **rs)
+# Single-metric scorer dicts (NOTE(review): not referenced anywhere else in this patch)
+mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
+jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
+###############################################################################
+homedir = os.path.expanduser("~")
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions') # makes the local modules imported below findable
+sys.path # NOTE(review): bare expression; has no effect when run as a script
+###############################################################################
+outdir = homedir # NOTE(review): not referenced anywhere else in this patch
+
+from GetMLData import * # presumably provides getmldata() used further down -- verify
+from SplitTTS import * # presumably provides split_tts() used further down -- verify
+
+
+def remove(string): # helper: returns string with every ' ' (space) character deleted
+    return(string.replace(" ", "")) # only the space character is replaced; tabs/newlines stay
+#%%############################################################################
+############################
+# XGBClf()
+# Run Multiple Classifiers
+############################
+# Multiple Classification - Model Pipeline
+def XGBClf(input_df, target, sel_cv
+                       , blind_test_df
+                       , blind_test_target
+                       , tts_split_type
+
+                       , resampling_type = 'none' # default
+                       #, add_cm = True # adds confusion matrix based on cross_val_predict
+                       #, add_yn = True  # adds target var class numbers
+                       , var_type = ['numerical', 'categorical','mixed']
+                       , run_blind_test = True
+                       #, return_formatted_output = True
+                       ):
+
+    '''
+    Cross-validate XGBoost, Random Forest and Logistic Regression pipelines and,
+    optionally, refit each one on the full input to score a blind test set.
+
+    @param input_df: input features WITHOUT the target variable
+    @type: df
+    @param target: target (or output) feature; counted as classes 0 and 1
+    @type: df or np.array or Series
+    @param sel_cv: passed to cross_validate() as cv
+    @type: int or StratifiedKfold()
+    @param var_type: 'numerical', 'categorical' or 'mixed'; picks the col_transform
+        (MinMaxScaler and/or OneHotEncoder). The list default itself is rejected.
+    @type: str
+    @param blind_test_df, blind_test_target: only used when run_blind_test is True
+    @param tts_split_type, resampling_type: labels stored as-is in the output
+
+    returns: dict keyed by model name holding the mean of each cross_validate()
+        entry (rounded to 2 dp), input metadata and, if requested, 'bts_*' scores
+    raises: ValueError if var_type is not one of the three supported strings
+    '''
+
+    #======================================================
+    # Determine categorical and numerical features
+    #======================================================
+    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
+    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+
+    #======================================================
+    # Determine preprocessing steps ~ var_type
+    #======================================================
+    if var_type == 'numerical':
+        t = [('num', MinMaxScaler(), numerical_ix)]
+    elif var_type == 'categorical':
+        t = [('cat', OneHotEncoder(), categorical_ix)]
+    elif var_type == 'mixed':
+        t = [('num', MinMaxScaler(), numerical_ix)
+            , ('cat', OneHotEncoder(), categorical_ix) ]
+    else:
+        # the list default (or any other value) used to leave `t` unbound below
+        raise ValueError("var_type must be 'numerical', 'categorical' or 'mixed', got: "
+                         + repr(var_type))
+
+    col_transform = ColumnTransformer(transformers = t
+                                       , remainder='passthrough')
+
+    #======================================================
+    # Specify multiple Classification Models
+    #======================================================
+    models = [ ('XGBoost' , XGBClassifier(**rs, verbosity = 3, use_label_encoder = False, **njobs) )
+              , ( 'Random Forest', RandomForestClassifier(**rs, **njobs, n_estimators = 1000))
+              , ('Logistic Regression', LogisticRegression(**rs))]
+
+    train_countsD = Counter(target) # class counts are model-independent: count once
+    mm_skf_scoresD = {}
+
+    print('\n==============================================================\n'
+          , '\nRunning several classification models (n):', len(models)
+          ,'\nList of models:')
+    for m in models:
+        print(m)
+    print('\n================================================================\n')
+
+    for index, (model_name, model_fn) in enumerate(models, start = 1):
+        print('\nRunning classifier:', index
+              , '\nModel_name:'               , model_name
+              , '\nModel func:'               , model_fn)
+
+        model_pipeline = Pipeline([
+            ('prep'     , col_transform)
+            , ('model'  , model_fn)])
+
+        print('\nRunning model pipeline:', model_pipeline)
+        skf_cv_modD = cross_validate(model_pipeline
+                              , input_df
+                              , target
+                              , cv = sel_cv
+                              , scoring = scoring_fn)
+        #==============================
+        # Extract mean values for CV
+        #==============================
+        mm_skf_scoresD[model_name] = {}
+
+        for key, value in skf_cv_modD.items():
+            print('\nkey:', key, '\nvalue:', value)
+            print('\nmean value:', np.mean(value))
+            mm_skf_scoresD[model_name][key] = round(np.mean(value),2)
+
+        # ADD more info: meta data related to input df
+        mm_skf_scoresD[model_name]['resampling']        = resampling_type
+        mm_skf_scoresD[model_name]['n_training_size']   = len(input_df)
+        mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(train_countsD[0]/train_countsD[1], 2)
+        mm_skf_scoresD[model_name]['n_features']        = len(input_df.columns)
+        mm_skf_scoresD[model_name]['tts_split']         = tts_split_type
+
+        # FS
+        #mnf = remove(model_name)
+        #model_pipeline.fit(input_df, target)
+        #print('\nFeature importance:', (model_pipeline.named_steps.model.feature_importances_))
+        #allf_xgboost = model_pipeline.feature_names_in_
+        #fsi_model = model_pipeline.named_steps.model.feature_importances_
+        #mm_skf_scoresD[model_name]['fs_importance'] = fsi_model
+        # TODO: add this as a key
+        # NOTE: LogisticRegression has coef_ but no feature_importances_ attribute
+
+        #pyplot.bar(range(len(model_pipeline.named_steps.model.feature_importances_)), model_pipeline.named_steps.model.feature_importances_)
+        #pyplot.show()
+        #plot_importance(model_pipeline.named_steps.model.feature_importances_)
+        #pyplot.show()
+
+        if run_blind_test:
+            # Build bts numbers dict (blind test labels counted once per model)
+            bts_countsD = Counter(blind_test_target)
+            btD = {'n_blindY_neg'    : bts_countsD[0]
+                   , 'n_blindY_pos'  : bts_countsD[1]
+                   , 'n_testY_ratio' : round(bts_countsD[0]/bts_countsD[1], 2)
+                   , 'n_test_size'   : len(blind_test_df) }
+
+            # Update cmD+tnD dicts with btD
+            mm_skf_scoresD[model_name].update(btD)
+
+            #--------------------------------------------------------
+            # Build the final results with all scores for the model
+            #--------------------------------------------------------
+            #bts_predict = gscv_fs.predict(blind_test_df)
+            # Refit on the full training input: cross_validate() fits clones, not this pipeline
+            model_pipeline.fit(input_df, target)
+            bts_predict = model_pipeline.predict(blind_test_df)
+
+            bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
+            print('\nMCC on Blind test:'     , bts_mcc_score)
+            #print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
+
+            mm_skf_scoresD[model_name]['bts_mcc']       = bts_mcc_score
+            mm_skf_scoresD[model_name]['bts_fscore']    = round(f1_score(blind_test_target, bts_predict),2)
+            mm_skf_scoresD[model_name]['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2)
+            mm_skf_scoresD[model_name]['bts_recall']    = round(recall_score(blind_test_target, bts_predict),2)
+            mm_skf_scoresD[model_name]['bts_accuracy']  = round(accuracy_score(blind_test_target, bts_predict),2)
+            mm_skf_scoresD[model_name]['bts_roc_auc']   = round(roc_auc_score(blind_test_target, bts_predict),2) # from predict() output, not probabilities
+            mm_skf_scoresD[model_name]['bts_jcc']       = round(jaccard_score(blind_test_target, bts_predict),2)
+
+    return(mm_skf_scoresD)
+#%%
+sel_cv = skf_cv # NOTE(review): not read again below; the XGBClf call passes skf_cv directly
+# keyword arguments forwarded to getmldata()
+combined_model_paramD = {'data_combined_model'   : False
+                    , 'use_or'                   : False
+                    , 'omit_all_genomic_features': False
+                    , 'write_maskfile'           : False
+                    , 'write_outfile'            : False }
+#df = getmldata(gene, drug, **combined_model_paramD)
+df = getmldata('pncA', 'pyrazinamide', **combined_model_paramD)
+# Split the data; the keys read from the result below are 'X', 'y', 'X_bts' and 'y_bts'
+df2 = split_tts(df
+          , data_type = 'actual'
+          , split_type = '80_20'
+          , oversampling = False
+          , dst_colname = 'dst'
+          , target_colname = 'dst_mode'
+          , include_gene_name = True
+          , random_state = 42 # default
+      )
+# NOTE(review): result of the next line is discarded; it is True only if every column is named 'gene_name'
+all(df2['X'].columns.isin(['gene_name']))
+
+# Cross-validate the models defined in XGBClf and score each on the blind test set
+fooD = XGBClf (input_df = df2['X']
+                , target = df2['y']
+                , sel_cv = skf_cv
+                , run_blind_test = True
+                , blind_test_df =  df2['X_bts']
+                , blind_test_target =  df2['y_bts']
+                , tts_split_type  = '80_20'
+                , var_type = 'mixed'
+                , resampling_type = 'none' # default
+)
+
+# Per model: 'test_mcc' is the mean cross-validated MCC, 'bts_mcc' the MCC on the blind test set
+for k, v in fooD.items():
+    print('\nK:', k
+          , '\nTRAIN MCC:', fooD[k]['test_mcc']
+          ,  '\nBTS MCC:' , fooD[k]['bts_mcc'] )
+    
+#%%
+# # fit model on training data
+# model = XGBClassifier()
+# model.fit( df2['X'], df2['y'])
+# # feature importance
+# print(model.feature_importances_)
+# # plot
+# pyplot.bar(range(len(model.feature_importances_)), model.feature_importances_)
+# pyplot.show()
+