added ml_functions dir

2022-06-29 12:06:47 +01:00 · 2022-06-29 12:06:47 +01:00 · 9aadb0329f
commit 9aadb0329f
parent c85c965c3e
30 changed files with 683 additions and 606160 deletions
--- a/scripts/ml/ml_functions/FS.py
+++ b/scripts/ml/ml_functions/FS.py
@ -0,0 +1,394 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon May 23 23:25:26 2022
+
+@author: tanu
+"""
+#%%
+import os, sys
+import pandas as pd
+import numpy as np
+import pprint as pp
+from copy import deepcopy
+from sklearn import linear_model
+from sklearn import datasets
+from collections import Counter
+
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
+from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier
+
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.gaussian_process import GaussianProcessClassifier, kernels
+from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel
+
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
+from sklearn.neural_network import MLPClassifier
+
+from sklearn.svm import SVC
+from xgboost import XGBClassifier
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
+
+from sklearn.compose import ColumnTransformer
+from sklearn.compose import make_column_transformer
+
+from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
+
+# added
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
+
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
+from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
+
+from sklearn.pipeline import Pipeline, make_pipeline
+
+from sklearn.feature_selection import RFE, RFECV
+
+import itertools
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+from statistics import mean, stdev, median, mode
+
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.under_sampling import RandomUnderSampler
+from imblearn.over_sampling import SMOTE
+from sklearn.datasets import make_classification
+from imblearn.combine import SMOTEENN
+from imblearn.combine import SMOTETomek
+
+from imblearn.over_sampling import SMOTENC
+from imblearn.under_sampling import EditedNearestNeighbours
+from imblearn.under_sampling import RepeatedEditedNearestNeighbours
+
+from sklearn.model_selection import GridSearchCV
+from sklearn.base import BaseEstimator
+from sklearn.impute import KNNImputer as KNN
+import json
+import argparse
+import re
+#####################################
+# Shared keyword-argument dicts, unpacked with ** wherever an estimator or
+# splitter is built: a fixed seed, and the number of parallel workers.
+rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
+
+# Scorers keyed by a short name. With multi-metric scoring, GridSearchCV
+# reports each one in cv_results_ as 'mean_test_<name>' etc.
+# NOTE(review): 'roc_auc' is built with plain make_scorer(), so it is
+# presumably computed from predicted labels rather than probabilities --
+# confirm that this is intended.
+scoring_fn =  ({ 'mcc'        : make_scorer(matthews_corrcoef)
+                , 'fscore'    : make_scorer(f1_score)
+                , 'precision' : make_scorer(precision_score)
+                , 'recall'    : make_scorer(recall_score)
+                , 'accuracy'  : make_scorer(accuracy_score)
+                , 'roc_auc'   : make_scorer(roc_auc_score)
+                , 'jcc'       : make_scorer(jaccard_score)
+            }) 
+  
+# 10-fold stratified CV, shuffled with the fixed seed from rs.
+skf_cv = StratifiedKFold(n_splits = 10
+                          #, shuffle = False, random_state= None)
+                           , shuffle = True,**rs)
+
+# 10-fold stratified CV repeated 3 times, seeded from rs.
+rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+                                  , n_repeats = 3
+                                  , **rs)
+
+# Single-metric scorer dicts (MCC only / Jaccard only).
+mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
+jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
+
+###############################################################################
+def fsgs_rfecv(input_df
+         , target
+         , param_gridLd = [{'fs__min_features_to_select' : [1]}]
+         , blind_test_df = pd.DataFrame()
+         , blind_test_target = pd.Series(dtype = 'int64')
+         , estimator = LogisticRegression(**rs) # placeholder
+         , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
+         , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv =  skf_cv, scoring = 'matthews_corrcoef')
+         , cv_method =  skf_cv
+         , var_type = ['numerical', 'categorical' , 'mixed']
+         , verbose = 3
+         ):
+    '''
+    returns
+    Dict containing results from FS and hyperparam tuning for a given estiamtor
+    
+    >>> ADD MORE <<<
+    
+    optimised/selected based on mcc
+    
+    '''
+    ###########################################################################
+    #================================================
+    # Determine categorical and numerical features
+    #================================================
+    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
+    numerical_ix
+    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+    categorical_ix    
+    
+    #================================================
+    # Determine preprocessing steps ~ var_type
+    #================================================
+    if var_type == 'numerical':
+        t = [('num', MinMaxScaler(), numerical_ix)]
+    
+    if var_type == 'categorical':
+        t = [('cat', OneHotEncoder(), categorical_ix)]
+    
+    if var_type == 'mixed':
+        t = [('cat', OneHotEncoder(), categorical_ix)
+              , ('num', MinMaxScaler(), numerical_ix)]
+        
+    col_transform = ColumnTransformer(transformers = t
+                                        , remainder='passthrough')
+    
+    ###########################################################################
+    #==================================================
+    # Create var_type ~ column names
+    # using one hot encoder with RFECV means 
+    # the names internally are lost. Hence
+    # fit col_transformeer to my input_df and get 
+    # all the column names out and stored in a var
+    # to allow the 'selected features' to be subsetted
+    # from the numpy boolean array
+    #=================================================
+    col_transform.fit(input_df)
+    col_transform.get_feature_names_out()
+    
+    var_type_colnames = col_transform.get_feature_names_out()
+    var_type_colnames = pd.Index(var_type_colnames)
+    
+    if var_type == 'mixed':
+        print('\nVariable type is:', var_type
+              , '\nNo. of columns in input_df:', len(input_df.columns)
+              , '\nNo. of columns post one hot encoder:', len(var_type_colnames))
+    else:
+        print('\nNo. of columns in input_df:', len(input_df.columns))
+        
+    #==================================
+    # Build FS with supplied estimator
+    #==================================
+    if use_fs:
+        fs = custom_fs
+    else:
+        fs = RFECV(estimator, cv = skf_cv, scoring = 'matthews_corrcoef')
+    
+    #==================================
+    # Build basic param grid
+    #==================================
+    # param_gridD = [
+    #     {'fs__min_features_to_select' : [1] 
+    #       }]
+        
+    ############################################################################   
+    # Create Pipeline object
+    pipe = Pipeline([
+        ('pre', col_transform),
+        ('fs', fs),
+        ('clf', estimator)])
+    ############################################################################   
+    # Define GridSearchCV
+    gscv_fs = GridSearchCV(pipe
+                           #, param_gridLd = param_gridD
+                           , param_gridLd
+                           , cv = cv_method
+                           , scoring = scoring_fn
+                           , refit = 'mcc'
+                           , verbose = 3
+                           , return_train_score = True
+                           , **njobs)
+    
+    gscv_fs.fit(input_df, target)
+    
+    ###########################################################################
+    # Get best param and scores out
+    gscv_fs.best_params_
+    gscv_fs.best_score_
+    
+    # Training best score corresponds to the max of the mean_test<score>
+    train_bscore = round(gscv_fs.best_score_, 2); train_bscore
+    print('\nTraining best score (MCC):', train_bscore)
+    gscv_fs.cv_results_['mean_test_mcc']
+    round(gscv_fs.cv_results_['mean_test_mcc'].max(),2)
+    round(np.nanmax(gscv_fs.cv_results_['mean_test_mcc']),2)
+    
+    check_train_score = [round(gscv_fs.cv_results_['mean_test_mcc'].max(),2)
+                        , round(np.nanmax(gscv_fs.cv_results_['mean_test_mcc']),2)]
+    
+    check_train_score = np.nanmax(check_train_score)
+    
+    # Training results
+    gscv_tr_resD = gscv_fs.cv_results_
+    mod_refit_param =  gscv_fs.refit
+    
+    # sanity check
+    if train_bscore == check_train_score:
+        print('\nVerified training score (MCC):', train_bscore )
+    else:
+        sys.exit('\nTraining score could not be internatlly verified. Please check training results dict')
+        
+    #-------------------------
+    # Dict of CV results
+    #-------------------------
+    cv_allD = gscv_fs.cv_results_
+    cvdf0   = pd.DataFrame(cv_allD)
+    cvdf    = cvdf0.filter(regex='mean_test', axis = 1)
+    cvdfT   = cvdf.T
+    cvdfT.columns = ['cv_score']
+    cvdfTr = cvdfT.loc[:,'cv_score'].round(decimals = 2) # round values
+    cvD     = cvdfTr.to_dict()
+    print('\n CV results dict generated for:', len(scoring_fn), 'scores'
+          , '\nThese are:', scoring_fn.keys())
+        
+    #-------------------------
+    # Blind test: REAL check!
+    #-------------------------
+    #tp = gscv_fs.predict(X_bts)
+    tp = gscv_fs.predict(blind_test_df)
+
+    print('\nMCC on Blind test:'     , round(matthews_corrcoef(blind_test_target, tp),2))
+    print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, tp),2))
+    
+    #=================
+    # info extraction
+    #=================
+    # gives input vals??
+    gscv_fs._check_n_features
+    
+    # gives gscv params used
+    gscv_fs._get_param_names()
+    
+    # gives ??
+    gscv_fs.best_estimator_
+    gscv_fs.best_params_ # gives best estimator params as a dict 
+    gscv_fs.best_estimator_._final_estimator # similar to above, doesn't contain max_iter
+    gscv_fs.best_estimator_.named_steps['fs'].get_support()
+    gscv_fs.best_estimator_.named_steps['fs'].ranking_ # array of ranks for the features
+    
+    gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.mean()
+    gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max()
+    #gscv_fs.best_estimator_.named_steps['fs'].grid_scores_
+    
+    estimator_mask = gscv_fs.best_estimator_.named_steps['fs'].get_support()
+
+    
+    ############################################################################
+    #============
+    # FS results
+    #============
+    # Now get the features out
+    
+    #--------------
+    # All features 
+    #--------------
+    all_features = gscv_fs.feature_names_in_
+    n_all_features =  gscv_fs.n_features_in_
+    #all_features = gsfit.feature_names_in_
+    
+    #--------------
+    # Selected features by the classifier
+    # Important to have var_type_colnames here
+    #----------------
+    #sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] 3 only for numerical df    
+    sel_features = var_type_colnames[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
+    n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_
+    
+    #--------------
+    # Get model name
+    #--------------
+    model_name     = gscv_fs.best_estimator_.named_steps['clf']
+    b_model_params = gscv_fs.best_params_
+    
+    print('\n========================================'
+          , '\nRunning model:'
+          , '\nModel name:', model_name
+          , '\n==============================================='
+          , '\nRunning feature selection with RFECV for model'
+          , '\nTotal no. of features in model:', len(all_features)
+          , '\nThese are:\n',  all_features, '\n\n'
+          , '\nNo of features for best model: ', n_sf
+          , '\nThese are:', sel_features, '\n\n'
+          , '\nBest Model hyperparams:', b_model_params
+          )
+    
+    ###########################################################################
+    ############################## OUTPUT #####################################
+    ###########################################################################
+    #=========================
+    # Blind test: BTS results
+    #=========================
+    # Build the final results with all scores for a feature selected model
+    #bts_predict = gscv_fs.predict(X_bts)
+    bts_predict = gscv_fs.predict(blind_test_df)
+
+    print('\nMCC on Blind test:'     , round(matthews_corrcoef(blind_test_target, bts_predict),2))
+    print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
+    bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
+    
+    # Diff b/w train and bts test scores
+    train_test_diff = train_bscore - bts_mcc_score
+    print('\nDiff b/w train and blind test score (MCC):', train_test_diff)
+    
+    lr_btsD ={}
+    #lr_btsD['bts_mcc']       = bts_mcc_score
+    lr_btsD['bts_fscore']    = round(f1_score(blind_test_target, bts_predict),2)
+    lr_btsD['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2)
+    lr_btsD['bts_recall']    = round(recall_score(blind_test_target, bts_predict),2)
+    lr_btsD['bts_accuracy']  = round(accuracy_score(blind_test_target, bts_predict),2)
+    lr_btsD['bts_roc_auc']   = round(roc_auc_score(blind_test_target, bts_predict),2)
+    lr_btsD['bts_jcc']       = round(jaccard_score(blind_test_target, bts_predict),2)
+    lr_btsD
+    
+    #===========================
+    # Add FS related model info
+    #===========================
+    model_namef = str(model_name)
+    # FIXME: doesn't tell you which it has chosen
+    fs_methodf = str(gscv_fs.best_estimator_.named_steps['fs'])
+    all_featuresL = list(all_features)
+    fs_res_arrayf = str(list( gscv_fs.best_estimator_.named_steps['fs'].get_support()))
+    fs_res_array_rankf = str(list( gscv_fs.best_estimator_.named_steps['fs'].ranking_))
+    sel_featuresf = list(sel_features)
+    n_sf = int(n_sf)
+    
+    output_modelD = {'model_name': model_namef
+                     , 'model_refit_param': mod_refit_param
+                     , 'Best_model_params': b_model_params 
+                     , 'n_all_features': n_all_features
+                     , 'fs_method': fs_methodf
+                     , 'fs_res_array': fs_res_arrayf
+                     , 'fs_res_array_rank': fs_res_array_rankf
+                     , 'all_feature_names': all_featuresL
+                     , 'n_sel_features': n_sf
+                     , 'sel_features_names': sel_featuresf}
+    #output_modelD
+    
+    #========================================
+    # Update output_modelD with bts_results
+    #========================================
+    output_modelD.update(lr_btsD)
+    output_modelD
+    
+    output_modelD['train_score (MCC)'] = train_bscore
+    output_modelD['bts_mcc'] = bts_mcc_score
+    output_modelD['train_bts_diff'] = round(train_test_diff,2)
+    print(output_modelD)
+    
+    nlen = len(output_modelD)
+    
+    #========================================
+    # Update output_modelD with cv_results
+    #========================================
+    output_modelD.update(cvD)
+    
+    if (len(output_modelD) == nlen + len(cvD)):
+        print('\nFS run complete for model:', estimator
+              , '\nFS using:', fs
+              , '\nOutput dict size:', len(output_modelD))
+        return(output_modelD)
+    else:
+        sys.exit('\nFAIL:numbers mismatch output dict length not as expected. Please check')
+    
--- a/scripts/ml/ml_functions/GetMLData.py
+++ b/scripts/ml/ml_functions/GetMLData.py
@ -0,0 +1,646 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Mar  6 13:41:54 2022
+
+@author: tanu
+"""
+
+#https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
+import os, sys
+import pandas as pd
+import numpy as np
+print(np.__version__)
+print(pd.__version__)
+import pprint as pp
+from copy import deepcopy
+from collections import Counter
+from sklearn.impute import KNNImputer as KNN
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.under_sampling import RandomUnderSampler
+from imblearn.over_sampling import SMOTE
+from sklearn.datasets import make_classification
+from imblearn.combine import SMOTEENN
+from imblearn.combine import SMOTETomek
+
+from imblearn.over_sampling import SMOTENC
+from imblearn.under_sampling import EditedNearestNeighbours
+from imblearn.under_sampling import RepeatedEditedNearestNeighbours
+
+from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
+
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
+from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
+
+from sklearn.pipeline import Pipeline, make_pipeline
+import argparse
+import re
+
+
+def getmldata(gene, drug
+              , data_combined_model = False
+              , use_or = False
+              , omit_all_genomic_features = False
+              , write_maskfile = False
+              , write_outfile = False):
+
+    #%% FOR LATER: Combine ED logo data
+    #%% constructing genomic feature group
+    #========================
+    # FG: Genomic features
+    #========================
+    X_gn_maf_Fnum =  ['maf']
+    #X_gn_or_Fnum =  ['logorI', 'or_rawI', 'or_mychisq', 'or_logistic', 'or_fisher', 'pval_fisher']
+    
+    X_gn_linegae_Fnum  = ['lineage_proportion'
+                          , 'dist_lineage_proportion'
+                          #, 'lineage' # could be included as a category but it has L2;L4  formatting
+                          , 'lineage_count_all'
+                          , 'lineage_count_unique']
+    
+    # X_gn_Fcat = ['drtype_mode_labels'  # beware then you can't use it to predict [USED it for uq_v1, not v2]
+    #                #, 'gene_name'] # will be required for the combined stuff
+    #X_gn_Fcat = []
+    
+    if data_combined_model:
+        X_geneF = ['gene_name']
+    else:
+        X_geneF = []
+
+    if use_or:
+        X_gn_or_Fnum =  ['logorI']
+    else:
+        X_gn_or_Fnum = []
+    
+    if omit_all_genomic_features:
+        print('\nOmitting all genomic features (n):', len(X_gn_maf_Fnum) + len(X_gn_or_Fnum) + len(X_gn_linegae_Fnum) + len(X_geneF))
+        X_genomicFN = []
+        if use_or:
+           sys.exit('\nError: omitting genomic feature and using odds ratio are mutually exclusive')
+    else:
+        X_genomicFN = X_gn_maf_Fnum + X_gn_or_Fnum + X_gn_linegae_Fnum + X_geneF
+       
+    #%%
+    ###########################################################################
+
+    homedir = os.path.expanduser("~")
+    
+    geneL_basic     = ['pnca']
+    geneL_na        = ['gid']
+    geneL_na_ppi2   = ['rpob']
+    geneL_ppi2      = ['alr', 'embb', 'katg']
+    
+    #num_type = ['int64', 'float64']
+    num_type = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
+    cat_type = ['object', 'bool']
+    
+    #==============
+    # directories
+    #==============
+    datadir = homedir + '/git/Data/'
+    indir   = datadir + drug + '/input/'
+    outdir  = datadir + drug + '/output/'
+    outdir_ml = outdir + 'ml/'
+      
+    #==========================
+    # outfile for ML training:
+    #==========================
+    outFile_ml = outdir_ml + gene.lower() + '_training_data.csv'
+   
+    outFile_mask_ml = outdir_ml + gene.lower() + '_mask_check.csv'
+    
+    #=======
+    # input
+    #=======
+    
+    #---------
+    # File 1
+    #---------
+    infile_ml1 = outdir + gene.lower() + '_merged_df3.csv' 
+    #infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
+    
+    my_features_df = pd.read_csv(infile_ml1, index_col = 0) 
+    my_features_df  = my_features_df .reset_index(drop = True)
+    my_features_df.index
+    
+    my_features_df.dtypes
+    mycols = my_features_df.columns
+    
+    #---------
+    # File 2
+    #---------
+    infile_aaindex = outdir + 'aa_index/' + gene.lower() + '_aa.csv' 
+    aaindex_df = pd.read_csv(infile_aaindex, index_col = 0) 
+    aaindex_df.dtypes
+    
+    #-----------
+    # check for non-numerical columns
+    #-----------
+    if any(aaindex_df.dtypes==object):
+        print('\naaindex_df contains non-numerical data')
+    
+    aaindex_df_object = aaindex_df.select_dtypes(include = cat_type)
+    print('\nTotal no. of non-numerial columns:', len(aaindex_df_object.columns))
+    
+    expected_aa_ncols = len(aaindex_df.columns) - len(aaindex_df_object.columns)
+    
+    #-----------
+    # Extract numerical data only
+    #-----------
+    print('\nSelecting numerical data only')
+    aaindex_df = aaindex_df.select_dtypes(include = num_type)
+    
+    #---------------------------
+    # aaindex: sanity check 1
+    #---------------------------
+    if len(aaindex_df.columns) == expected_aa_ncols:
+        print('\nPASS: successfully selected numerical columns only for aaindex_df')
+    else:
+        print('\nFAIL: Numbers mismatch'
+              , '\nExpected ncols:', expected_aa_ncols
+              , '\nGot:', len(aaindex_df.columns))    
+        
+    #---------------
+    # check for NA
+    #---------------
+    print('\nNow checking for NA in the remaining aaindex_cols')
+    c1 = aaindex_df.isna().sum()
+    c2 = c1.sort_values(ascending=False)
+    print('\nCounting aaindex_df cols with NA'
+          , '\nncols with NA:', sum(c2>0), 'columns'
+          , '\nDropping these...'
+          , '\nOriginal ncols:', len(aaindex_df.columns)
+          )
+    aa_df = aaindex_df.dropna(axis=1)
+    
+    print('\nRevised df ncols:', len(aa_df.columns))
+    
+    c3 = aa_df.isna().sum()
+    c4 = c3.sort_values(ascending=False)
+    
+    print('\nChecking NA in revised df...')
+    
+    if sum(c4>0):
+        sys.exit('\nFAIL: aaindex_df still contains cols with NA, please check and drop these before proceeding...')
+    else:
+        print('\nPASS: cols with NA successfully dropped from aaindex_df'
+              , '\nProceeding with combining aa_df with other features_df')
+        
+    #---------------------------
+    # aaindex: sanity check 2
+    #---------------------------
+    expected_aa_ncols2 =  len(aaindex_df.columns) - sum(c2>0)  
+    if len(aa_df.columns) == expected_aa_ncols2:
+        print('\nPASS: ncols match'
+              , '\nExpected ncols:', expected_aa_ncols2
+              , '\nGot:', len(aa_df.columns))
+    else:
+        print('\nFAIL: Numbers mismatch'
+              , '\nExpected ncols:', expected_aa_ncols2
+              , '\nGot:', len(aa_df.columns))            
+        
+    # Important: need this to identify aaindex cols    
+    aa_df_cols = aa_df.columns
+    print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
+    
+    ###############################################################################
+    #%% Combining my_features_df and aaindex_df
+    #===========================
+    # Merge my_df + aaindex_df
+    #===========================
+    
+    if aa_df.columns[aa_df.columns.isin(my_features_df.columns)] == my_features_df.columns[my_features_df.columns.isin(aa_df.columns)]:
+        print('\nMerging on column: mutationinformation')   
+    
+    if len(my_features_df) == len(aa_df):
+        expected_nrows = len(my_features_df)
+        print('\nProceeding to merge, expected nrows in merged_df:', expected_nrows)
+    else:
+        sys.exit('\nNrows mismatch, cannot merge. Please check'
+              , '\nnrows my_df:', len(my_features_df)
+              , '\nnrows aa_df:', len(aa_df))
+               
+    #-----------------
+    # Reset index: mutationinformation
+    # Very important for merging
+    #-----------------
+    aa_df = aa_df.reset_index()
+    
+    expected_ncols = len(my_features_df.columns) + len(aa_df.columns) - 1 # for the no. of merging col
+    
+    #-----------------
+    # Merge: my_features_df + aa_df
+    #-----------------
+    merged_df = pd.merge(my_features_df
+                         , aa_df
+                         , on = 'mutationinformation')
+    
+    #---------------------------
+    # aaindex: sanity check 3
+    #---------------------------
+    if len(merged_df.columns) == expected_ncols:
+        print('\nPASS: my_features_df and aa_df successfully combined'
+              , '\nnrows:', len(merged_df)
+              , '\nncols:', len(merged_df.columns))
+    else:
+        sys.exit('\nFAIL: could not combine my_features_df and aa_df'
+                 , '\nCheck dims and merging cols!')
+        
+    #--------
+    # Reassign so downstream code doesn't need to change
+    #--------
+    my_df = merged_df.copy()
+    
+    #%% Data: my_df
+    # Check if non structural pos have crept in
+    # IDEALLY remove from source! But for rpoB do it here
+    # Drop NA where numerical cols have them
+    if gene.lower() in geneL_na_ppi2:
+        #D1148 get rid of
+        na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
+        my_df = my_df.drop(index=na_index)
+    
+    ###########################################################################
+    #%% Add lineage calculation columns
+    #FIXME: Check if this can be imported from config?
+    total_mtblineage_uc = 8
+    lineage_colnames = ['lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode']
+    #bar = my_df[lineage_colnames]
+    my_df['lineage_proportion']      = my_df['lineage_count_unique']/my_df['lineage_count_all']
+    my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineage_uc
+    ###########################################################################
+    #%% Active site annotation column
+    # change from numberic to categorical
+    
+    if my_df['active_site'].dtype in num_type:
+        my_df['active_site'] = my_df['active_site'].astype(object)
+        my_df['active_site'].dtype
+    #%% AA property change
+    #--------------------
+    # Water prop change
+    #--------------------
+    my_df['water_change'] = my_df['wt_prop_water'] + str('_to_') + my_df['mut_prop_water']
+    my_df['water_change'].value_counts()
+    
+    water_prop_changeD = {
+        'hydrophobic_to_neutral'          : 'change'
+        , 'hydrophobic_to_hydrophobic'    : 'no_change'
+        , 'neutral_to_neutral'            : 'no_change'
+        , 'neutral_to_hydrophobic'        : 'change'
+        , 'hydrophobic_to_hydrophilic'    : 'change'
+        , 'neutral_to_hydrophilic'        : 'change'
+        , 'hydrophilic_to_neutral'        : 'change'
+        , 'hydrophilic_to_hydrophobic'    : 'change'
+        , 'hydrophilic_to_hydrophilic'    : 'no_change'
+    }
+    
+    my_df['water_change'] = my_df['water_change'].map(water_prop_changeD)
+    my_df['water_change'].value_counts()
+    
+    #--------------------
+    # Polarity change
+    #--------------------
+    my_df['polarity_change'] = my_df['wt_prop_polarity'] + str('_to_') + my_df['mut_prop_polarity']
+    my_df['polarity_change'].value_counts()
+    
+    polarity_prop_changeD = {
+        'non-polar_to_non-polar'     : 'no_change'
+        , 'non-polar_to_neutral'     : 'change'  
+        , 'neutral_to_non-polar'     : 'change'  
+        , 'neutral_to_neutral'       : 'no_change'  
+        , 'non-polar_to_basic'       : 'change'  
+        , 'acidic_to_neutral'        : 'change'  
+        , 'basic_to_neutral'         : 'change'  
+        , 'non-polar_to_acidic'      : 'change'  
+        , 'neutral_to_basic'         : 'change'  
+        , 'acidic_to_non-polar'      : 'change'  
+        , 'basic_to_non-polar'       : 'change'
+        , 'neutral_to_acidic'        : 'change'
+        , 'acidic_to_acidic'         : 'no_change'
+        , 'basic_to_acidic'          : 'change'
+        , 'basic_to_basic'           : 'no_change'
+        , 'acidic_to_basic'          : 'change'}
+    
+    my_df['polarity_change'] = my_df['polarity_change'].map(polarity_prop_changeD)
+    my_df['polarity_change'].value_counts()
+    
+    #--------------------
+    # Electrostatics change
+    #--------------------
+    my_df['electrostatics_change'] = my_df['wt_calcprop'] + str('_to_') + my_df['mut_calcprop']
+    my_df['electrostatics_change'].value_counts()
+    
+    calc_prop_changeD = {
+            'non-polar_to_non-polar'     : 'no_change'
+            , 'non-polar_to_polar'       : 'change'
+            , 'polar_to_non-polar'       : 'change'
+            , 'non-polar_to_pos'         : 'change'
+            , 'neg_to_non-polar'         : 'change'
+            , 'non-polar_to_neg'         : 'change'
+            , 'pos_to_polar'             : 'change'
+            , 'pos_to_non-polar'         : 'change'
+            , 'polar_to_polar'           : 'no_change'
+            , 'neg_to_neg'               : 'no_change'
+            , 'polar_to_neg'             : 'change'
+            , 'pos_to_neg'               : 'change'
+            , 'pos_to_pos'               : 'no_change'
+            , 'polar_to_pos'             : 'change'
+            , 'neg_to_polar'             : 'change'
+            , 'neg_to_pos'               : 'change'
+    }
+    
+    my_df['electrostatics_change'] = my_df['electrostatics_change'].map(calc_prop_changeD)
+    my_df['electrostatics_change'].value_counts()
+    
+    #--------------------    
+    # Summary change: Create a combined column summarising these three cols
+    #--------------------
+    detect_change = 'change'
+    check_prop_cols = ['water_change', 'polarity_change', 'electrostatics_change']
+    #my_df['aa_prop_change'] = (my_df.values == detect_change).any(1).astype(int)
+    my_df['aa_prop_change'] = (my_df[check_prop_cols].values == detect_change).any(1).astype(int)
+    my_df['aa_prop_change'].value_counts()
+    my_df['aa_prop_change'].dtype
+    
+    my_df['aa_prop_change'] = my_df['aa_prop_change'].map({1:'change'
+                                                           , 0: 'no_change'})
+    
+    my_df['aa_prop_change'].value_counts()
+    my_df['aa_prop_change'].dtype
+    
+    #%% IMPUTE values for OR [check script for exploration: UQ_or_imputer]
+    #--------------------
+    # Impute OR values
+    #--------------------
+    #or_cols = ['or_mychisq', 'log10_or_mychisq', 'or_fisher']
+    sel_cols = ['mutationinformation', 'or_mychisq', 'log10_or_mychisq']
+    or_cols = ['or_mychisq', 'log10_or_mychisq']
+    
+    print("count of NULL values before imputation\n")
+    print(my_df[or_cols].isnull().sum())
+    
+    my_dfI = pd.DataFrame(index = my_df['mutationinformation'] )
+    
+        
+    my_dfI = pd.DataFrame(KNN(n_neighbors=3, weights="uniform").fit_transform(my_df[or_cols])
+                          , index =  my_df['mutationinformation']
+                          , columns = or_cols )
+    my_dfI.columns = ['or_rawI', 'logorI']
+    my_dfI.columns
+    my_dfI = my_dfI.reset_index(drop = False) # prevents old index from being added as a column
+    my_dfI.head()
+    print("count of NULL values AFTER imputation\n")
+    print(my_dfI.isnull().sum())
+    
+    #-------------------------------------------
+    # OR df Merge: with original based on index
+    #-------------------------------------------
+    #my_df['index_bm'] = my_df.index
+    mydf_imputed = pd.merge(my_df
+                        , my_dfI
+                        , on = 'mutationinformation')
+    #mydf_imputed = mydf_imputed.set_index(['index_bm'])
+    
+    my_df['log10_or_mychisq'].isna().sum()
+    mydf_imputed['log10_or_mychisq'].isna().sum()
+    mydf_imputed['logorI'].isna().sum() # should be 0
+    
+    len(my_df.columns)
+    len(mydf_imputed.columns)  
+    
+    #-----------------------------------------
+    # REASSIGN my_df after imputing OR values
+    #-----------------------------------------
+    my_df = mydf_imputed.copy()
+    
+    if my_df['logorI'].isna().sum() == 0:
+        print('\nPASS: OR values imputed, data ready for ML')
+    else:
+        sys.exit('\nFAIL: something went wrong, Data not ready for ML. Please check upstream!')
+    
+    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    #---------------------------------------
+    # TODO: try other imputation like MICE
+    #---------------------------------------
+    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    
+    #%%########################################################################
+    #==========================
+    #     Data for ML
+    #==========================
+    my_df_ml = my_df.copy()
+    
+    # Build column names to mask for affinity changes
+    if gene.lower() in geneL_basic:
+        #X_stabilityN = common_cols_stabiltyN
+        gene_affinity_colnames = []# not needed as its the common ones 
+        cols_to_mask = ['ligand_affinity_change']
+        
+    if gene.lower() in geneL_ppi2:
+        gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] 
+        #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
+        cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
+    
+    if gene.lower() in geneL_na:
+        gene_affinity_colnames =  ['mcsm_na_affinity'] 
+        #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
+        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
+    
+    if gene.lower() in geneL_na_ppi2:
+        gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
+        #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
+        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
+    
+    #=======================
+    # Masking columns:
+    # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
+    #=======================
+    my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
+    my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
+    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
+    
+    # mask the mcsm affinity related columns where ligand distance > 10
+    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
+    (my_df_ml['ligand_affinity_change'] == 0).sum()
+    
+    mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]  
+    
+    #===================================================
+    # write file for check
+    #mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
+    #mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
+    #===================================================
+    ###############################################################################
+    #%% Feature groups (FG): Build X for Input ML 
+    ############################################################################
+    #===========================
+    # FG1: Evolutionary features
+    #===========================
+    X_evolFN =  ['consurf_score'
+               , 'snap2_score'
+               , 'provean_score']
+    
+    ###############################################################################
+    #========================
+    # FG2: Stability features
+    #========================
+    #--------
+    # common
+    #--------
+    X_common_stability_Fnum = [
+               'duet_stability_change'
+               , 'ddg_foldx'
+               , 'deepddg'
+               , 'ddg_dynamut2'
+               , 'contacts']
+    #--------
+    # FoldX
+    #--------
+    X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
+    , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
+    , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
+    , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
+    , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
+    , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss']
+    
+    X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum
+    
+    ###############################################################################
+    #===================
+    # FG3: Affinity features
+    #===================
+    common_affinity_Fnum =  ['ligand_distance'
+                    , 'ligand_affinity_change'
+                    , 'mmcsm_lig']
+    
+    # if gene.lower() in geneL_basic:
+    #     X_affinityFN = common_affinity_Fnum 
+    # else:
+    #     X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
+        
+    X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
+    
+    ###############################################################################
+    #============================
+    # FG4: Residue level features
+    #============================
+    #-----------
+    # AA index
+    #-----------
+    X_aaindex_Fnum = list(aa_df_cols)
+    print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum))
+    
+    #-----------------
+    # surface area
+    # depth
+    # hydrophobicity
+    #-----------------
+    X_str_Fnum =  ['rsa'
+               #, 'asa'
+               , 'kd_values'
+               , 'rd_values']   
+    
+    #---------------------------
+    # Other aa properties
+    # active site indication
+    #---------------------------
+    X_aap_Fcat = ['ss_class'
+                # , 'wt_prop_water'
+                # , 'mut_prop_water'
+                # , 'wt_prop_polarity'
+                # , 'mut_prop_polarity'
+                # , 'wt_calcprop'
+                # , 'mut_calcprop'
+                , 'aa_prop_change'
+                , 'electrostatics_change'
+                , 'polarity_change'
+                , 'water_change'
+                , 'active_site']
+       
+    X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
+    ###############################################################################
+    #========================
+    # FG5: Genomic features
+    #========================
+    # See the beginning section
+    if use_or:
+        print('\nALL Genomic features being used (n):', len(X_genomicFN)
+          , '\nThese are:', X_genomicFN)
+    else:
+        print('\nGenomic features being used EXCLUDING odds ratio (n):', len(X_genomicFN)
+          , '\nThese are:', X_genomicFN)
+   
+    ###############################################################################
+    #========================
+    # FG6 collapsed: Structural : Stability + Affinity + ResidueProp
+    #========================
+    X_structural_FN =  X_stability_FN + X_affinityFN + X_resprop_FN
+    
+    ###############################################################################
+    #========================
+    # BUILDING all features
+    #========================
+    all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
+    
+    ###############################################################################
+    #%% Define training and test data
+    #================================================================
+    # Training and BLIND test set: 70/30
+    # dst with actual values  : training set
+    # dst with imputed values : THROW AWAY [unrepresentative]
+    #================================================================
+    my_df_ml[drug].isna().sum()
+    
+    #    blind_test_df = my_df_ml[my_df_ml[drug].isna()]
+    #    blind_test_df.shape
+    
+    # training_df = my_df_ml[my_df_ml[drug].notna()]
+    # training_df.shape
+    
+    # training_df = my_df_ml.copy()
+     
+    # # Target 1: dst_mode
+    # training_df[drug].value_counts()
+    # training_df['dst_mode'].value_counts()
+    
+    #all_training_df = my_df_ml[all_featuresN]
+
+    # Getting the dst column as this will be required for tts_split()    
+    if 'dst' in my_df_ml:
+        print('\ndst column exists')
+        if my_df_ml['dst'].equals(my_df_ml[drug]):
+            print('\nand this is identical to drug column:', drug)
+    
+        all_featuresN2 = all_featuresN + ['dst', 'dst_mode']
+        all_training_df = my_df_ml[all_featuresN2]
+        
+    print('\nAll feature names:', all_featuresN2)
+    ####################################################################
+
+    #==========================================================================
+    if write_maskfile:
+        print('\nPASS: and now writing file to check masked columns and values:', outFile_mask_ml )
+        mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
+        mask_check.to_csv(outFile_mask_ml, index = False)
+    else:
+        print('\nPASS: but NOT writing mask file')
+    #==========================================================================
+    if write_outfile:
+        print('\nPASS: and now writing processed file for ml:', outFile_ml)
+        #all_training_df.to_csv(outFile_ml, index = False)
+    else:
+        print('\nPASS: But NOT writing processed file')
+    #==========================================================================
+    
+    print('\n#################################################################'
+          , '\nSUCCESS: Extacted training data for gene:', gene.lower()
+          , '\nDim of training_df:', all_training_df.shape)
+    if use_or:
+        print('\nThis includes Odds Ratio'
+              , '\n###########################################################')
+    else:
+        print('\nThis EXCLUDES Odds Ratio'
+              , '\n############################################################')
+ 
+    return(all_training_df)
--- a/scripts/ml/ml_functions/MultClfs.py
+++ b/scripts/ml/ml_functions/MultClfs.py
@ -0,0 +1,533 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Mar  4 15:25:33 2022
+
+@author: tanu
+"""
+#%%
+import os, sys
+import pandas as pd
+import numpy as np
+import pprint as pp
+from copy import deepcopy
+from sklearn import linear_model
+from sklearn import datasets
+from collections import Counter
+
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
+from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier
+
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.gaussian_process import GaussianProcessClassifier, kernels
+from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel
+
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
+from sklearn.neural_network import MLPClassifier
+
+from sklearn.svm import SVC
+from xgboost import XGBClassifier
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
+
+from sklearn.compose import ColumnTransformer
+from sklearn.compose import make_column_transformer
+
+from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
+
+# added
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
+
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
+from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
+
+from sklearn.pipeline import Pipeline, make_pipeline
+
+from sklearn.feature_selection import RFE, RFECV
+
+import itertools
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+from statistics import mean, stdev, median, mode
+
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.under_sampling import RandomUnderSampler
+from imblearn.over_sampling import SMOTE
+from sklearn.datasets import make_classification
+from imblearn.combine import SMOTEENN
+from imblearn.combine import SMOTETomek
+
+from imblearn.over_sampling import SMOTENC
+from imblearn.under_sampling import EditedNearestNeighbours
+from imblearn.under_sampling import RepeatedEditedNearestNeighbours
+
+from sklearn.model_selection import GridSearchCV
+from sklearn.base import BaseEstimator
+from sklearn.impute import KNNImputer as KNN
+import json
+import argparse
+import re
+#%% GLOBALS
+rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
+
+scoring_fn =  ({ 'mcc'        : make_scorer(matthews_corrcoef)
+                , 'fscore'    : make_scorer(f1_score)
+                , 'precision' : make_scorer(precision_score)
+                , 'recall'    : make_scorer(recall_score)
+                , 'accuracy'  : make_scorer(accuracy_score)
+                , 'roc_auc'   : make_scorer(roc_auc_score)
+                , 'jcc'       : make_scorer(jaccard_score)
+            }) 
+  
+skf_cv = StratifiedKFold(n_splits = 10
+                          #, shuffle = False, random_state= None)
+                           , shuffle = True,**rs)
+
+rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+                                  , n_repeats = 3
+                                  , **rs)
+
+mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
+jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
+
+###############################################################################
+score_type_ordermapD = { 'mcc'      : 1
+                   , 'fscore'       : 2
+                   , 'jcc'          : 3
+                   , 'precision'    : 4
+                   , 'recall'       : 5      
+                   , 'accuracy'     : 6  
+                   , 'roc_auc'      : 7
+                   , 'TN'           : 8
+                   , 'FP'           : 9
+                   , 'FN'           : 10
+                   , 'TP'           : 11  
+                   , 'trainingY_neg': 12  
+                   , 'trainingY_pos': 13    
+                   , 'blindY_neg'   : 14
+                   , 'blindY_pos'   : 15
+                   , 'fit_time'     : 16
+                   , 'score_time'   : 17
+                   }
+
+scoreCV_mapD = {'test_mcc'         : 'MCC'
+                , 'test_fscore'    : 'F1'
+                , 'test_precision' : 'Precision'
+                , 'test_recall'    : 'Recall'
+                , 'test_accuracy'  : 'Accuracy'
+                , 'test_roc_auc'   : 'ROC_AUC'
+                , 'test_jcc'       : 'JCC'
+                }
+
+scoreBT_mapD = {'bts_mcc'          : 'MCC'
+                , 'bts_fscore'     : 'F1'
+                , 'bts_precision'  : 'Precision'
+                , 'bts_recall'     : 'Recall'
+                , 'bts_accuracy'   : 'Accuracy'
+                , 'bts_roc_auc'    : 'ROC_AUC'
+                , 'bts_jcc'        : 'JCC'
+               }
+
+#%%############################################################################
+############################
+# MultModelsCl()
+# Run Multiple Classifiers
+############################
+# Multiple Classification - Model Pipeline
+def MultModelsCl(input_df, target, skf_cv
+                       , blind_test_df
+                       , blind_test_target
+                       , tts_split_type 
+                       , run_blind_test = True
+
+                       , resampling_type = 'none' # default
+                       , add_cm = True # adds confusion matrix based on cross_val_predict
+                       , add_yn = True  # adds target var class numbers
+                       , var_type = ['numerical', 'categorical','mixed']
+                       , return_formatted_output = True):
+
+    '''
+    @ param input_df: input features 
+    @ type: df with input features WITHOUT the target variable
+    
+    @param target: target (or output) feature
+    @type: df or np.array or Series
+    
+    @param skv_cv: stratifiedK fold int or object to allow shuffle and random state to pass
+    @type: int or StratifiedKfold()
+    
+    @var_type: numerical, categorical and mixed to determine what col_transform to apply (MinMaxScalar and/or one-ho    t encoder)
+    @type: list
+
+    returns
+    Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training
+    '''
+
+    #======================================================
+    # Determine categorical and numerical features
+    #======================================================
+    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
+    numerical_ix
+    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+    categorical_ix    
+
+    #======================================================
+    # Determine preprocessing steps ~ var_type
+    #======================================================
+    if var_type == 'numerical':
+        t = [('num', MinMaxScaler(), numerical_ix)]
+
+    if var_type == 'categorical':
+        t = [('cat', OneHotEncoder(), categorical_ix)]
+    
+    if var_type == 'mixed':
+        t = [('num', MinMaxScaler(), numerical_ix)
+            , ('cat', OneHotEncoder(), categorical_ix) ]
+        
+    col_transform = ColumnTransformer(transformers = t
+                                       , remainder='passthrough')
+    
+    #======================================================
+    # Specify multiple Classification Models  
+    #======================================================
+    models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
+               , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
+               , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
+               , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
+               , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
+               , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
+               , ('Gaussian NB'               , GaussianNB() )
+               , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
+               , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
+               , ('LDA'                       , LinearDiscriminantAnalysis() )
+               , ('Logistic Regression'       , LogisticRegression(**rs) )
+               , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
+               , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
+               , ('Multinomial'               , MultinomialNB() )
+               , ('Naive Bayes'               , BernoulliNB() )
+               , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
+               , ('QDA'                       , QuadraticDiscriminantAnalysis() )
+               , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000 ) ) 
+               , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
+                                                                       , n_estimators     = 1000
+                                                                       , bootstrap        = True
+                                                                       , oob_score        = True
+                                                                       , **njobs
+                                                                       , **rs
+                                                                       , max_features     = 'auto') ) 
+                , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
+                , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
+                , ('SVC'                       , SVC(**rs) ) 
+                , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
+                , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
+             ]
+                
+    mm_skf_scoresD = {}
+    
+    print('\n==============================================================\n'
+          , '\nRunning several classification models (n):', len(models)
+          ,'\nList of models:')
+    for m in models:
+        print(m)
+    print('\n================================================================\n')
+    
+    index = 1
+    for model_name, model_fn in models:
+        print('\nRunning classifier:', index
+              , '\nModel_name:'               , model_name
+              , '\nModel func:'               , model_fn)
+        index = index+1
+        
+        model_pipeline = Pipeline([
+            ('prep'     , col_transform)
+            , ('model'  , model_fn)])
+            
+        print('\nRunning model pipeline:', model_pipeline)
+        skf_cv_modD = cross_validate(model_pipeline
+                              , input_df
+                              , target
+                              , cv = skf_cv
+                              , scoring = scoring_fn
+                              , return_train_score = True)
+        #==============================
+        # Extract mean values for CV 
+        #==============================
+        mm_skf_scoresD[model_name] = {}
+        
+        for key, value in skf_cv_modD.items():
+            print('\nkey:', key, '\nvalue:', value)
+            print('\nmean value:', np.mean(value))
+            mm_skf_scoresD[model_name][key] = round(np.mean(value),2)
+            
+        # ADD more info: meta data related to input df
+        mm_skf_scoresD[model_name]['resampling']        = resampling_type
+        mm_skf_scoresD[model_name]['n_training_size']   = len(input_df)
+        mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(Counter(target)[0]/Counter(target)[1], 2)
+        mm_skf_scoresD[model_name]['n_features']        = len(input_df.columns)
+        mm_skf_scoresD[model_name]['tts_split']         = tts_split_type
+        
+        #######################################################################
+        #======================================================
+        # Option: Add confusion matrix from cross_val_predict
+        # Understand and USE with caution
+        #======================================================
+        if add_cm:  
+           cmD = {}
+
+            # Calculate cm         
+           y_pred   = cross_val_predict(model_pipeline, input_df, target, cv = skf_cv, **njobs)
+            #_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally
+           tn, fp, fn, tp = confusion_matrix(y_pred, target).ravel()
+    
+            # Build cm dict
+           cmD = {'TN'  : tn
+                   , 'FP': fp
+                   , 'FN': fn
+                   , 'TP': tp}
+            
+            # Update cv dict cmD           
+           mm_skf_scoresD[model_name].update(cmD)
+            
+        #=============================================
+        # Option: Add targety numbers for data
+        #=============================================
+        if add_yn:   
+            tnD = {}
+                       
+            # Build tn numbers dict
+            tnD = {'n_trainingY_neg'    : Counter(target)[0]
+                   , 'n_trainingY_pos'  : Counter(target)[1] }
+            
+            # Update cv dict with cmD and tnD
+            mm_skf_scoresD[model_name].update(tnD)
+
+#%%
+        #=========================
+        # Option: Blind test (bts)
+        #=========================
+        if run_blind_test:
+           btD = {}
+           
+           # Build bts numbers dict
+           btD = {'n_blindY_neg'    : Counter(blind_test_target)[0]
+                  , 'n_blindY_pos'  : Counter(blind_test_target)[1]
+                  , 'n_testY_ratio' : round(Counter(blind_test_target)[0]/Counter(blind_test_target)[1], 2)
+                  , 'n_test_size'   : len(blind_test_df) }
+           
+           # Update cmD+tnD dicts with btD
+           mm_skf_scoresD[model_name].update(btD)
+                    
+           #--------------------------------------------------------
+           # Build the final results with all scores for the model
+           #--------------------------------------------------------
+           #bts_predict = gscv_fs.predict(blind_test_df)
+           model_pipeline.fit(input_df, target)
+           bts_predict = model_pipeline.predict(blind_test_df)
+           
+           bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
+           print('\nMCC on Blind test:'     , bts_mcc_score)
+           print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
+           
+           mm_skf_scoresD[model_name]['bts_mcc']       = bts_mcc_score
+           mm_skf_scoresD[model_name]['bts_fscore']    = round(f1_score(blind_test_target, bts_predict),2)
+           mm_skf_scoresD[model_name]['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2)
+           mm_skf_scoresD[model_name]['bts_recall']    = round(recall_score(blind_test_target, bts_predict),2)
+           mm_skf_scoresD[model_name]['bts_accuracy']  = round(accuracy_score(blind_test_target, bts_predict),2)
+           mm_skf_scoresD[model_name]['bts_roc_auc']   = round(roc_auc_score(blind_test_target, bts_predict),2)
+           mm_skf_scoresD[model_name]['bts_jcc']       = round(jaccard_score(blind_test_target, bts_predict),2)
+           #mm_skf_scoresD[model_name]['diff_mcc']      = train_test_diff_MCC
+#%%
+        # ADD more info: meta data related to input and blind and resampling
+    
+        # target numbers: training
+        yc1           = Counter(target)
+        yc1_ratio     = yc1[0]/yc1[1]
+    
+        # target numbers: test
+        yc2       = Counter(blind_test_target)
+        yc2_ratio = yc2[0]/yc2[1]
+    
+        mm_skf_scoresD[model_name]['resampling']        = resampling_type
+        
+        mm_skf_scoresD[model_name]['n_training_size']   = len(input_df)
+        mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(yc1_ratio, 2)
+       
+        mm_skf_scoresD[model_name]['n_test_size']     = len(blind_test_df)
+        mm_skf_scoresD[model_name]['n_testY_ratio']   = round(yc2_ratio,2)
+        mm_skf_scoresD[model_name]['n_features']      = len(input_df.columns)
+        mm_skf_scoresD[model_name]['tts_split']       = tts_split_type
+
+    #return(mm_skf_scoresD)
+    #============================
+    # Process the dict to have WF
+    #============================
+    if return_formatted_output:
+        CV_BT_metaDF = ProcessMultModelsCl(mm_skf_scoresD)
+        return(CV_BT_metaDF)
+    else:
+        return(mm_skf_scoresD)
+
+#%% Process output function ###################################################
+############################
+# ProcessMultModelsCl() 
+############################
+# Processes the dict from MultModelsCl() when return_formatted_output = True
+
+def ProcessMultModelsCl(inputD = {}, blind_test_data = True):
+    
+    scoresDF = pd.DataFrame(inputD)
+    
+    #------------------------
+    #  Extracting split_name
+    #-----------------------
+    tts_split_nameL = []
+    for k,v in inputD.items():
+        tts_split_nameL = tts_split_nameL + [v['tts_split']]
+    
+    if len(set(tts_split_nameL)) == 1:
+        tts_split_name = str(list(set(tts_split_nameL))[0])
+        print('\nExtracting tts_split_name:', tts_split_name)
+    
+    #----------------------
+    #  WF: CV results
+    #----------------------
+    scoresDFT = scoresDF.T
+    
+    scoresDF_CV = scoresDFT.filter(regex='^test_.*$', axis = 1); scoresDF_CV.columns
+    # map colnames for consistency to allow concatenting
+    scoresDF_CV.columns = scoresDF_CV.columns.map(scoreCV_mapD); scoresDF_CV.columns
+    scoresDF_CV['source_data'] = 'CV'
+    
+    #----------------------
+    #  WF: Meta data 
+    #----------------------
+    metaDF = scoresDFT.filter(regex='^(?!test_.*$|bts_.*$|train_.*$).*'); metaDF.columns
+    
+    print('\nTotal cols in each df:'
+          , '\nCV df:', len(scoresDF_CV.columns)
+          , '\nmetaDF:', len(metaDF.columns))
+    
+    #-------------------------------------
+    # Combine WF: CV + Metadata
+    #-------------------------------------
+
+    combDF = pd.merge(scoresDF_CV, metaDF, left_index = True, right_index = True)  
+    print('\nAdding column: Model_name')
+    combDF['Model_name'] = combDF.index
+
+    #----------------------
+    #  WF: BTS results
+    #----------------------      
+    if blind_test_data:
+          
+        scoresDF_BT = scoresDFT.filter(regex='^bts_.*$', axis = 1); scoresDF_BT.columns
+        # map colnames for consistency to allow concatenting
+        scoresDF_BT.columns = scoresDF_BT.columns.map(scoreBT_mapD); scoresDF_BT.columns
+        scoresDF_BT['source_data'] = 'BT'
+    
+    
+        print('\nTotal cols in bts df:'
+              , '\nBT_df:', len(scoresDF_BT.columns))
+        
+        if  len(scoresDF_CV.columns) == len(scoresDF_BT.columns):
+            print('\nFirst proceeding to rowbind CV and BT dfs:')
+            expected_ncols_out = len(scoresDF_BT.columns) + len(metaDF.columns)
+            print('\nFinal output should have:', expected_ncols_out, 'columns' )
+
+        #-----------------
+        # Combine WF
+        #-----------------
+        dfs_combine_wf = [scoresDF_CV, scoresDF_BT]
+    
+        print('\nCombinig', len(dfs_combine_wf), 'using pd.concat by row ~ rowbind'
+              , '\nChecking Dims of df to combine:'
+              , '\nDim of CV:', scoresDF_CV.shape
+              , '\nDim of BT:', scoresDF_BT.shape)
+        #print(scoresDF_CV)
+        #print(scoresDF_BT)
+    
+        dfs_nrows_wf = []
+        for df in dfs_combine_wf:
+            dfs_nrows_wf = dfs_nrows_wf + [len(df)]
+        dfs_nrows_wf = max(dfs_nrows_wf)
+            
+        dfs_ncols_wf = []
+        for df in dfs_combine_wf:
+            dfs_ncols_wf = dfs_ncols_wf + [len(df.columns)]
+        dfs_ncols_wf = max(dfs_ncols_wf)
+        print(dfs_ncols_wf)
+        
+        expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
+        expected_ncols_wf = dfs_ncols_wf
+        
+        common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))
+        print('\nNumber of Common columns:', dfs_ncols_wf
+              , '\nThese are:', common_cols_wf)
+    
+        if len(common_cols_wf) == dfs_ncols_wf :
+            combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index=False)
+            print('\nConcatenating dfs with different resampling methods [WF]:'
+                  , '\nSplit type:', tts_split_name
+                  , '\nNo. of dfs combining:', len(dfs_combine_wf))
+            #print('\n================================================^^^^^^^^^^^^')
+            if len(combined_baseline_wf) == expected_nrows_wf  and len(combined_baseline_wf.columns) == expected_ncols_wf:
+                #print('\n================================================^^^^^^^^^^^^')
+    
+                print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
+                      , '\nnrows in combined_df_wf:', len(combined_baseline_wf)
+                      , '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
+            else:
+                print('\nFAIL: concatenating failed'
+                      , '\nExpected nrows:', expected_nrows_wf
+                      , '\nGot:', len(combined_baseline_wf)
+                      , '\nExpected ncols:', expected_ncols_wf
+                      , '\nGot:', len(combined_baseline_wf.columns))
+                sys.exit('\nFIRST IF FAILS')
+            ##
+            c1L = list(set(combined_baseline_wf.index))
+            c2L = list(metaDF.index)
+
+            #if set(c1L) == set(c2L):
+            if set(c1L) == set(c2L) and all(x in c2L for x in c1L) and all(x in c1L for x in c2L):
+                print('\nPASS: proceeding to merge metadata with CV and BT dfs')
+                combDF = pd.merge(combined_baseline_wf, metaDF, left_index = True, right_index = True)
+                print('\nAdding column: Model_name')
+                combDF['Model_name'] = combDF.index
+                
+            else:
+                sys.exit('\nFAIL: Could not merge metadata with CV and BT dfs')
+
+        else:
+            print('\nConcatenting dfs not possible [WF],check numbers ')    
+
+        #-------------------------------------
+        # Combine WF+Metadata: Final output
+        #-------------------------------------
+       
+        # if len(combDF.columns) == expected_ncols_out:
+        #     print('\nPASS: Combined df has expected ncols')
+        # else:
+        #     sys.exit('\nFAIL: Length mismatch for combined_df')
+            
+        # print('\nAdding column: Model_name')
+        # combDF['Model_name'] = combDF.index
+        
+        print('\n========================================================='
+              , '\nSUCCESS: Ran multiple classifiers'
+              , '\n=======================================================')
+
+    #resampling_methods_wf = combined_baseline_wf[['resampling']]
+    #resampling_methods_wf = resampling_methods_wf.drop_duplicates()
+              #, '\n', resampling_methods_wf)
+
+    return combDF
+
+###############################################################################
--- a/scripts/ml/ml_functions/SplitTTS.py
+++ b/scripts/ml/ml_functions/SplitTTS.py
@ -0,0 +1,287 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Jun 25 11:07:30 2022
+
+@author: tanu
+"""
+
+import sys, os
+import pandas as pd
+import numpy as np
+import os, sys
+import pandas as pd
+import numpy as np
+print(np.__version__)
+print(pd.__version__)
+import pprint as pp
+from copy import deepcopy
+from collections import Counter
+from sklearn.impute import KNNImputer as KNN
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.under_sampling import RandomUnderSampler
+from imblearn.over_sampling import SMOTE
+from sklearn.datasets import make_classification
+from imblearn.combine import SMOTEENN
+from imblearn.combine import SMOTETomek
+
+from imblearn.over_sampling import SMOTENC
+from imblearn.under_sampling import EditedNearestNeighbours
+from imblearn.under_sampling import RepeatedEditedNearestNeighbours
+
+from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
+
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
+from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
+
+from sklearn.pipeline import Pipeline, make_pipeline
+import argparse
+import re
+homedir = os.path.expanduser("~")  # current user's home directory ('~' expanded)
+#%% GLOBALS: keyword-argument dicts unpacked as **rs / **njobs in split_tts below
+rs = {'random_state': 42}  # fixed seed, passed to train_test_split and SMOTENC
+njobs = {'n_jobs': 10}  # worker count, passed to SMOTENC
+
+#%% Define split_tts function #################################################
+def split_tts(ml_input_data
+              , data_type      = ['actual', 'complete']
+              , split_type     = ['70_30', '80_20', 'sl']
+              , oversampling   = True
+              , dst_colname    = 'dst'# determine how to subset the actual vs reverse data
+              , target_colname = 'dst_mode'
+              , include_gene_name = True
+              , k_smote = 5):
+    
+    outDict = {}
+    
+    print('\nInput params:' 
+          , '\nDim of input df:'   , ml_input_data.shape
+          , '\nData type to split:', data_type
+          , '\nSplit type:'        , split_type
+          , '\ntarget colname:'    , target_colname)
+        
+    if oversampling:
+        print('\noversampling enabled')
+    else:
+        print('\nNot generating oversampled or undersampled data')
+    
+    if include_gene_name:
+        cols_to_dropL = []
+    else:
+        cols_to_dropL = ['gene_name']
+
+    #====================================
+    # evaluating use_data_type
+    #====================================
+    if data_type == 'actual':
+        ml_data = ml_input_data[ml_input_data[dst_colname].notna()]
+    if data_type == 'complete':
+        ml_data = ml_input_data.copy()
+        
+    #====================================
+    # separate features and target
+    #====================================
+    cols_to_dropL = cols_to_dropL + [target_colname, dst_colname]
+    x_features    = ml_data.drop(cols_to_dropL, axis = 1)
+    y_target      = ml_data[target_colname]
+        
+    # sanity check
+    check1 = x_features[[i for i in cols_to_dropL if i in x_features.columns]]
+    
+    #if not 'dst_mode' in x_features.columns:
+    if check1.empty:
+        print('\nPASS: x_features has no target variable and no dst column'
+              , '\nDropped cols:', len(cols_to_dropL)
+              , '\nThese were:', target_colname,'and', dst_colname)
+        x_ncols = len(x_features.columns)
+        print('\nNo. of cols in input df:', len(ml_input_data.columns)
+              , '\nNo.of cols dropped:', len(cols_to_dropL)
+              , '\nNo. of columns for x_features:', x_ncols)
+    else:
+        sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')
+        
+    #====================================
+    # Train test split
+    # with stratification
+    #=====================================
+    if split_type == '70_30':
+        tts_test_size = 0.33
+    if split_type == '80_20':
+        tts_test_size = 0.2
+    if split_type == 'sl':
+        tts_test_size = 1/np.sqrt(x_ncols)
+        train_sl = 1 - tts_test_size
+    
+    #-------------------------
+    #  TTS split ~ split_type
+    #-------------------------
+    #x_train, x_test, y_train, y_test # traditional var_names
+    # so my downstream code doesn't need to change    
+    X, X_bts, y, y_bts = train_test_split(x_features, y_target
+                                                    , test_size = tts_test_size
+                                                    , **rs
+                                                    , stratify = y_target)
+    yc1 = Counter(y)
+    yc1_ratio = yc1[0]/yc1[1]
+    
+    yc2 = Counter(y_bts)
+    yc2_ratio = yc2[0]/yc2[1]
+    ###############################################################################
+    #======================================================
+    # Determine categorical and numerical features
+    #======================================================
+    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
+    numerical_cols 
+    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
+    categorical_cols 
+    ###############################################################################
+    print('\n-------------------------------------------------------------'
+          , '\nSuccessfully generated training and test data:'
+          , '\nData used:' , data_type
+          , '\nSplit type:', split_type
+
+          , '\n\nTotal no. of input features:'      , len(X.columns)
+          , '\n--------No. of numerical features:'  , len(numerical_cols)
+          , '\n--------No. of categorical features:', len(categorical_cols)
+          
+          , '\n==========================='
+          , '\n Resampling: NONE'
+          , '\nBaseline'
+          , '\n==========================='
+          
+          , '\n\nTotal data size:', len(X) + len(X_bts)
+    
+          , '\n\nTrain data size:', X.shape
+          , '\ny_train numbers:', yc1
+    
+          , '\n\nTest data size:', X_bts.shape
+          , '\ny_test_numbers:', yc2
+    
+          , '\n\ny_train ratio:',yc1_ratio
+          , '\ny_test ratio:', yc2_ratio
+          , '\n-------------------------------------------------------------')
+    
+    outDict.update({'X'       : X
+            , 'X_bts' : X_bts
+            , 'y'     : y
+            , 'y_bts' : y_bts
+            } ) 
+    
+    if oversampling:
+        #######################################################################
+        #                               RESAMPLING
+        #######################################################################
+        #------------------------------
+        # Simple Random oversampling
+        # [Numerical + catgeorical]
+        #------------------------------
+        oversample = RandomOverSampler(sampling_strategy='minority')
+        X_ros, y_ros = oversample.fit_resample(X, y)
+        print('\nSimple Random OverSampling\n', Counter(y_ros))
+        print(X_ros.shape)
+        
+        #------------------------------
+        # Simple Random Undersampling
+        # [Numerical + catgeorical]
+        #------------------------------
+        undersample = RandomUnderSampler(sampling_strategy='majority')
+        X_rus, y_rus = undersample.fit_resample(X, y)
+        print('\nSimple Random UnderSampling\n', Counter(y_rus))
+        print(X_rus.shape)
+        
+        #------------------------------
+        # Simple combine ROS and RUS
+        # [Numerical + catgeorical]
+        #------------------------------
+        oversample = RandomOverSampler(sampling_strategy='minority')
+        X_ros, y_ros = oversample.fit_resample(X, y)
+        undersample = RandomUnderSampler(sampling_strategy='majority')
+        X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
+        print('\nSimple Combined Over and UnderSampling\n',  Counter(y_rouC))
+        print(X_rouC.shape)
+        
+        #------------------------------
+        # SMOTE_NC: oversampling 
+        # [numerical + categorical]
+        #https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
+        #------------------------------
+        # Determine categorical and numerical features
+        numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
+        numerical_ix
+        num_featuresL = list(numerical_ix)
+        numerical_colind = X.columns.get_indexer(list(numerical_ix) )
+        numerical_colind
+        
+        categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
+        categorical_ix    
+        categorical_colind = X.columns.get_indexer(list(categorical_ix))
+        categorical_colind
+        
+        #k_sm = 5 # default
+        k_sm = k_smote
+        sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
+        X_smnc, y_smnc = sm_nc.fit_resample(X, y)
+        print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
+        print(X_smnc.shape)
+        
+        print('\nGenerated Resampled data as below:'
+            , '\n================================='
+            , '\nResampling: Random oversampling'
+            , '\n================================'
+             
+            , '\n\nTrain data size:', X_ros.shape
+            , '\ny_train numbers:', len(y_ros)
+            , '\n\ny_train ratio:', Counter(y_ros)[0]/Counter(y_ros)[1]
+            
+            , '\ny_test ratio:' , yc2_ratio
+            ##################################################################
+            , '\n================================'
+            , '\nResampling: Random underampling'
+            , '\n================================'
+            
+            , '\n\nTrain data size:', X_rus.shape
+            , '\ny_train numbers:', len(y_rus)
+            , '\n\ny_train ratio:', Counter(y_rus)[0]/Counter(y_rus)[1]
+            
+            , '\ny_test ratio:' , yc2_ratio
+            ##################################################################
+            , '\n================================'
+            , '\nResampling:Combined (over+under)'
+            , '\n================================'
+                                    
+            , '\n\nTrain data size:', X_rouC.shape
+            , '\ny_train numbers:', len(y_rouC)
+            , '\n\ny_train ratio:', Counter(y_rouC)[0]/Counter(y_rouC)[1]
+            
+            , '\ny_test ratio:' , yc2_ratio
+            ##################################################################            
+            , '\n=============================='
+            , '\nResampling: Smote NC'
+            , '\n=============================='
+            
+            , '\n\nTrain data size:', X_smnc.shape
+            , '\ny_train numbers:', len(y_smnc)
+            , '\n\ny_train ratio:', Counter(y_smnc)[0]/Counter(y_smnc)[1]
+            
+            , '\ny_test ratio:' , yc2_ratio
+            ##################################################################
+           , '\n-------------------------------------------------------------')
+
+        outDict.update({'X_ros'   : X_ros
+                        , 'y_ros' : y_ros
+                            
+                        , 'X_rus' : X_rus
+                        , 'y_rus' : y_rus
+                            
+                        , 'X_rouC': X_rouC
+                        , 'y_rouC': y_rouC
+                            
+                        , 'X_smnc': X_smnc
+                        , 'y_smnc': y_smnc})
+        return(outDict)
+        
+     #   globals().update(locals()) # TROLOLOLOLOLOLS
+     
+    else:
+        return(outDict)