added ml_functions dir

2022-06-29 12:06:47 +01:00 · 2022-06-29 12:06:47 +01:00 · 9aadb0329f
commit 9aadb0329f
parent c85c965c3e
30 changed files with 683 additions and 606160 deletions
--- a/scripts/ml/combined_model/ml_data_combined
+++ b/scripts/ml/combined_model/ml_data_combined
@ -1,73 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Sat Jun 25 11:07:30 2022
-
-@author: tanu
-"""
-
-import sys, os
-import pandas as pd
-import numpy as np
-import re
-
-###############################################################################
-homedir = os.path.expanduser("~")
-sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/functions')
-###############################################################################
-#====================
-# Import ML functions 
-#====================
-#from MultClfs import *
-from GetMLData import *
-from SplitTTS import *
-#%% Load all gene files #######################################################
-# param dict
-# Options forwarded unchanged to every getmldata() call below
-combined_model_paramD = {'data_combined_model'   : True
-                    , 'use_or'                   : False
-                    , 'omit_all_genomic_features': False
-                    , 'write_maskfile'           : False
-                    , 'write_outfile'            : False }
-
-# One ML input df per gene/drug pair
-pnca_df = getmldata('pncA', 'pyrazinamide' , **combined_model_paramD)
-embb_df = getmldata('embB', 'ethambutol'   , **combined_model_paramD)
-katg_df = getmldata('katG', 'isoniazid'    , **combined_model_paramD)
-rpob_df = getmldata('rpoB', 'rifampicin'   , **combined_model_paramD)
-gid_df  = getmldata('gid' , 'streptomycin' , **combined_model_paramD)
-alr_df  = getmldata('alr' , 'cycloserine'  , **combined_model_paramD)
-
-# quick check
-foo = pd.concat([alr_df, pnca_df])
-check1 = foo.filter(regex= '.*_affinity|gene_name|ligand_distance', axis = 1)
-# So, pd.concat will join correctly but introduce NAs.
-# TODO: discuss whether to make these 0 and use it or just omit
-# For now I am omitting these i.e combining only on common columns
-
-dfs_combine = [pnca_df, embb_df, katg_df, rpob_df, gid_df, alr_df]
-expected_nrows = sum(len(df) for df in dfs_combine)
-
-# finding common columns
-# Walk the columns of the first df instead of taking a raw set intersection:
-# set iteration order for strings changes between interpreter runs, which
-# would otherwise shuffle the feature column order from one run to the next.
-common_cols = [col for col in pnca_df.columns
-               if all(col in df.columns for df in dfs_combine[1:])]
-
-# The common columns can only number this many when every column of the
-# narrowest df is also present in all the other dfs.
-expected_ncols = min(len(df.columns) for df in dfs_combine)
-
-if len(common_cols) != expected_ncols:
-    # Stop here: without combined_df the split below could only fail later
-    # with an unrelated-looking NameError.
-    sys.exit('\nFAIL: could not combine dfs, length mismatch'
-             + '\nExpected ncols: ' + str(expected_ncols)
-             + '\nGot: ' + str(len(common_cols)))
-
-print('\nProceeding to combine based on common cols (n):', len(common_cols))
-# ignore_index = False: each df keeps its own row labels in the combined df
-combined_df = pd.concat([df[common_cols] for df in dfs_combine], ignore_index = False)
-
-if len(combined_df) != expected_nrows:
-    sys.exit('\nFAIL: could not combine dfs, row mismatch'
-             + '\nExpected nrows: ' + str(expected_nrows)
-             + '\nGot: ' + str(len(combined_df)))
-
-print('\nSuccessfully combined dfs:'
-      , '\nNo. of dfs combined:', len(dfs_combine)
-      , '\nDim of combined df:', combined_df.shape)
-#%% split data into different data types
-tts_7030_paramD = {'data_type'    : 'actual'
-              , 'split_type'      : '70_30'
-              , 'oversampling'    : True}
-
-data_CM_7030D = split_tts(ml_input_data = combined_df
-          , **tts_7030_paramD
-          , dst_colname = 'dst'
-          , target_colname = 'dst_mode'
-          , include_gene_name = False) # when not doing leave one group out
--- a/scripts/ml/functions/MultClfs.py
+++ b/scripts/ml/functions/MultClfs.py
@ -1,833 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Fri Mar  4 15:25:33 2022
-
-@author: tanu
-"""
-#%%
-import os, sys
-import pandas as pd
-import numpy as np
-import pprint as pp
-from copy import deepcopy
-from sklearn import linear_model
-from sklearn import datasets
-from collections import Counter
-
-from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
-from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier
-
-from sklearn.naive_bayes import BernoulliNB
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.svm import SVC
-from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
-from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
-from sklearn.naive_bayes import GaussianNB
-from sklearn.gaussian_process import GaussianProcessClassifier, kernels
-from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel
-
-from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
-from sklearn.neural_network import MLPClassifier
-
-from sklearn.svm import SVC
-from xgboost import XGBClassifier
-from sklearn.naive_bayes import MultinomialNB
-from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
-
-from sklearn.compose import ColumnTransformer
-from sklearn.compose import make_column_transformer
-
-from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
-from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
-
-# added
-from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
-
-from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
-from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
-
-from sklearn.pipeline import Pipeline, make_pipeline
-
-from sklearn.feature_selection import RFE, RFECV
-
-import itertools
-import seaborn as sns
-import matplotlib.pyplot as plt
-
-from statistics import mean, stdev, median, mode
-
-from imblearn.over_sampling import RandomOverSampler
-from imblearn.under_sampling import RandomUnderSampler
-from imblearn.over_sampling import SMOTE
-from sklearn.datasets import make_classification
-from imblearn.combine import SMOTEENN
-from imblearn.combine import SMOTETomek
-
-from imblearn.over_sampling import SMOTENC
-from imblearn.under_sampling import EditedNearestNeighbours
-from imblearn.under_sampling import RepeatedEditedNearestNeighbours
-
-from sklearn.model_selection import GridSearchCV
-from sklearn.base import BaseEstimator
-from sklearn.impute import KNNImputer as KNN
-import json
-import argparse
-import re
-#%% GLOBALS
-# Shared keyword arguments, unpacked (**rs, **njobs) into estimators and CV objects
-rs = dict(random_state = 42)
-njobs = dict(n_jobs = 10)
-
-# Multi-metric scorers: score name -> scorer built with make_scorer()
-scoring_fn = {score_name: make_scorer(score_func)
-              for score_name, score_func in (
-                  ('mcc'         , matthews_corrcoef)
-                  , ('fscore'    , f1_score)
-                  , ('precision' , precision_score)
-                  , ('recall'    , recall_score)
-                  , ('accuracy'  , accuracy_score)
-                  , ('roc_auc'   , roc_auc_score)
-                  , ('jcc'       , jaccard_score)
-              )}
-
-# 10-fold stratified CV, shuffled with the fixed seed
-skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, **rs)
-
-# 10-fold stratified CV repeated 3 times with the fixed seed
-rskf_cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, **rs)
-
-# Single-metric scorer dicts
-mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
-jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
-
-###############################################################################
-# Score type -> rank (1-based, in the order listed)
-score_type_ordermapD = {score_type: rank
-                        for rank, score_type in enumerate(
-                            ['mcc', 'fscore', 'jcc', 'precision', 'recall'
-                             , 'accuracy', 'roc_auc'
-                             , 'TN', 'FP', 'FN', 'TP'
-                             , 'trainingY_neg', 'trainingY_pos'
-                             , 'blindY_neg', 'blindY_pos'
-                             , 'fit_time', 'score_time']
-                            , start = 1)}
-
-# CV score column names (test_*) -> display names
-scoreCV_mapD = {
-    'test_mcc': 'MCC',
-    'test_fscore': 'F1',
-    'test_precision': 'Precision',
-    'test_recall': 'Recall',
-    'test_accuracy': 'Accuracy',
-    'test_roc_auc': 'ROC_AUC',
-    'test_jcc': 'JCC',
-}
-
-# Blind test score column names (bts_*) -> the same display names
-scoreBT_mapD = {
-    'bts_mcc': 'MCC',
-    'bts_fscore': 'F1',
-    'bts_precision': 'Precision',
-    'bts_recall': 'Recall',
-    'bts_accuracy': 'Accuracy',
-    'bts_roc_auc': 'ROC_AUC',
-    'bts_jcc': 'JCC',
-}
-
-
-#%%############################################################################
-############################
-# MultModelsCl()
-# Run Multiple Classifiers
-############################
-# Multiple Classification - Model Pipeline
-def MultModelsCl(input_df, target, skf_cv
-                       , blind_test_df
-                       , blind_test_target
-                       , tts_split_type 
-                       , resampling_type = 'none' # default
-                       , add_cm = True # adds confusion matrix based on cross_val_predict
-                       , add_yn = True  # adds target var class numbers
-                       , var_type = ['numerical', 'categorical','mixed']
-                       , return_formatted_output = True):
-
-    '''
-    Cross-validate a fixed list of classifiers and score each on a blind test set.
-
-    Each classifier is wrapped in a Pipeline (column transformer + model),
-    scored with cross_validate() using the module-level scoring_fn, then
-    fitted on all of input_df/target and used to predict blind_test_df.
-
-    @param input_df: input features
-    @type: df with input features WITHOUT the target variable
-
-    @param target: target (or output) feature; class counts are read for
-    the labels 0 and 1
-    @type: np.array or Series
-
-    @param skf_cv: cv argument passed to cross_validate and cross_val_predict
-    @type: int or StratifiedKfold()
-
-    @param blind_test_df: features of the blind test set
-    @type: df with the same columns as input_df
-
-    @param blind_test_target: target of the blind test set
-    @type: np.array or Series
-
-    @param tts_split_type: label stored as 'tts_split' for every model
-    @param resampling_type: label stored as 'resampling' for every model
-    (no resampling is carried out in this function)
-
-    @param add_cm: add TN, FP, FN, TP from cross_val_predict to the scores
-    @param add_yn: add counts of class 0 and class 1 (training and blind)
-
-    @param var_type: 'numerical', 'categorical' or 'mixed'; decides which
-    column transforms to apply (MinMaxScaler and/or one-hot encoder).
-    The signature default only lists the options, so one of the three
-    strings must be passed explicitly.
-    @type: str
-
-    @param return_formatted_output: return the output of
-    ProcessMultModelsCl() instead of the raw dict
-
-    raises
-    ValueError if var_type is not one of the three strings above
-
-    returns
-    Dict {model_name: {score_name: value}} with the mean of each
-    cross_validate() score (rounded to 2dp), blind test scores and metadata,
-    or the df built from that dict when return_formatted_output is True
-    '''
-
-    #======================================================
-    # Determine categorical and numerical features
-    #======================================================
-    numerical_ix   = input_df.select_dtypes(include=['int64', 'float64']).columns
-    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
-
-    #======================================================
-    # Determine preprocessing steps ~ var_type
-    #======================================================
-    num_step = ('num', MinMaxScaler(), numerical_ix)
-    cat_step = ('cat', OneHotEncoder(), categorical_ix)
-    transformersD = {'numerical'     : [num_step]
-                     , 'categorical' : [cat_step]
-                     , 'mixed'       : [num_step, cat_step]}
-
-    # Without this check an unmatched var_type (including the untouched
-    # list default) left the transformer list undefined and surfaced as an
-    # UnboundLocalError further down.
-    if not isinstance(var_type, str) or var_type not in transformersD:
-        raise ValueError("var_type must be one of 'numerical', 'categorical' or 'mixed', got: "
-                         + repr(var_type))
-
-    col_transform = ColumnTransformer(transformers = transformersD[var_type]
-                                       , remainder='passthrough')
-
-    #======================================================
-    # Specify multiple Classification Models  
-    #======================================================
-    models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
-               , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
-               , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
-               , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
-               , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
-               , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
-               , ('Gaussian NB'               , GaussianNB() )
-               , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
-               , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
-               , ('LDA'                       , LinearDiscriminantAnalysis() )
-               , ('Logistic Regression'       , LogisticRegression(**rs) )
-               , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
-               , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
-               , ('Multinomial'               , MultinomialNB() )
-               , ('Naive Bayes'               , BernoulliNB() )
-               , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
-               , ('QDA'                       , QuadraticDiscriminantAnalysis() )
-               , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000 ) ) 
-               # max_features 'sqrt' is what 'auto' meant for classifiers;
-               # 'auto' is deprecated in newer scikit-learn releases
-               , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
-                                                                       , n_estimators     = 1000
-                                                                       , bootstrap        = True
-                                                                       , oob_score        = True
-                                                                       , **njobs
-                                                                       , **rs
-                                                                       , max_features     = 'sqrt') ) 
-                , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
-                , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
-                , ('SVC'                       , SVC(**rs) ) 
-                , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-                , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
-             ]
-
-    mm_skf_scoresD = {}
-
-    print('\n==============================================================\n'
-          , '\nRunning several classification models (n):', len(models)
-          ,'\nList of models:')
-    for m in models:
-        print(m)
-    print('\n================================================================\n')
-
-    #======================================================
-    # Class numbers: identical for every model, so count once
-    #======================================================
-    train_ynC = Counter(target)
-    blind_ynC = Counter(blind_test_target)
-
-    # neg/pos ratio; NaN instead of a ZeroDivisionError when class 1 is absent
-    train_yn_ratio = train_ynC[0]/train_ynC[1] if train_ynC[1] else np.nan
-    blind_yn_ratio = blind_ynC[0]/blind_ynC[1] if blind_ynC[1] else np.nan
-
-    for index, (model_name, model_fn) in enumerate(models, start = 1):
-        print('\nRunning classifier:', index
-              , '\nModel_name:'               , model_name
-              , '\nModel func:'               , model_fn)
-
-        model_pipeline = Pipeline([
-            ('prep'     , col_transform)
-            , ('model'  , model_fn)])
-
-        print('\nRunning model pipeline:', model_pipeline)
-        skf_cv_modD = cross_validate(model_pipeline
-                              , input_df
-                              , target
-                              , cv = skf_cv
-                              , scoring = scoring_fn
-                              , return_train_score = True)
-
-        #######################################################################
-        #======================================================
-        # Option: Add confusion matrix from cross_val_predict
-        # Understand and USE with caution
-        # cross_val_score, cross_val_predict, "Passing these predictions into an evaluation metric may not be a valid way to measure generalization performance. Results can differ from cross_validate and cross_val_score unless all tests sets have equal size and the metric decomposes over samples."
-        # https://stackoverflow.com/questions/65645125/producing-a-confusion-matrix-with-cross-validate
-        #======================================================
-        if add_cm:
-            y_pred = cross_val_predict(model_pipeline, input_df, target, cv = skf_cv, **njobs)
-
-            # confusion_matrix takes (y_true, y_pred). Passing them the other
-            # way round transposes the matrix, i.e. swaps FP and FN.
-            tn, fp, fn, tp = confusion_matrix(target, y_pred).ravel()
-
-            skf_cv_modD.update({'TN'  : tn
-                               , 'FP': fp
-                               , 'FN': fn
-                               , 'TP': tp})
-
-        #######################################################################
-        #=============================================
-        # Option: Add target y numbers for data
-        #=============================================
-        if add_yn:
-            skf_cv_modD.update({'n_trainingY_neg'  : train_ynC[0]
-                               , 'n_trainingY_pos' : train_ynC[1]
-                               , 'n_blindY_neg'    : blind_ynC[0]
-                               , 'n_blindY_pos'    : blind_ynC[1]})
-
-        #######################################################################
-        #==============================
-        # Extract mean values for CV 
-        #==============================
-        # Per-fold arrays collapse to their mean; the scalars added above
-        # pass through np.mean unchanged. Everything is rounded to 2dp.
-        mm_skf_scoresD[model_name] = {}
-
-        for key, value in skf_cv_modD.items():
-            print('\nkey:', key, '\nvalue:', value)
-            print('\nmean value:', np.mean(value))
-            mm_skf_scoresD[model_name][key] = round(np.mean(value),2)
-
-        #=========================
-        # Blind test: BTS results
-        #=========================
-        # Refit on the full training data, then predict the blind test set
-        model_pipeline.fit(input_df, target)
-        bts_predict = model_pipeline.predict(blind_test_df)
-
-        bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
-        bts_acc_score = round(accuracy_score(blind_test_target, bts_predict),2)
-        print('\nMCC on Blind test:'     , bts_mcc_score)
-        print('\nAccuracy on Blind test:', bts_acc_score)
-
-        # NOTE: every blind test metric, including roc_auc, is computed from
-        # the predicted class labels returned by predict()
-        mm_skf_scoresD[model_name]['bts_mcc']       = bts_mcc_score
-        mm_skf_scoresD[model_name]['bts_fscore']    = round(f1_score(blind_test_target, bts_predict),2)
-        mm_skf_scoresD[model_name]['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2)
-        mm_skf_scoresD[model_name]['bts_recall']    = round(recall_score(blind_test_target, bts_predict),2)
-        mm_skf_scoresD[model_name]['bts_accuracy']  = bts_acc_score
-        mm_skf_scoresD[model_name]['bts_roc_auc']   = round(roc_auc_score(blind_test_target, bts_predict),2)
-        mm_skf_scoresD[model_name]['bts_jcc']       = round(jaccard_score(blind_test_target, bts_predict),2)
-
-        #=========================
-        # Meta data: input, blind test and resampling
-        #=========================
-        mm_skf_scoresD[model_name]['resampling']        = resampling_type
-
-        mm_skf_scoresD[model_name]['n_training_size']   = len(input_df)
-        mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(train_yn_ratio, 2)
-
-        mm_skf_scoresD[model_name]['n_test_size']     = len(blind_test_df)
-        mm_skf_scoresD[model_name]['n_testY_ratio']   = round(blind_yn_ratio,2)
-        mm_skf_scoresD[model_name]['n_features']      = len(input_df.columns)
-        mm_skf_scoresD[model_name]['tts_split']       = tts_split_type
-
-    #============================
-    # Process the dict to have WF
-    #============================
-    if return_formatted_output: 
-        CV_BT_metaDF = ProcessMultModelsCl(mm_skf_scoresD)
-        return(CV_BT_metaDF)
-    else:
-        return(mm_skf_scoresD)
-
-#%% Process output function ###################################################
-############################
-# ProcessMultModelsCl() 
-############################
-# Processes the dict built by MultModelsCl() when return_formatted_output = True
-
-def ProcessMultModelsCl(inputD = {}):
-    '''
-    Reshape the nested score dict from MultModelsCl() into a single df.
-
-    @param inputD: {model_name: {score_name: value}}. Score names starting
-    with test_ are taken as CV scores, bts_ as blind test scores, train_ are
-    dropped and everything else is treated as metadata. Every inner dict
-    must contain 'tts_split'. The dict is only read, never modified.
-    @type: dict
-
-    returns
-    df with two rows per model (source_data 'CV' and 'BT'), score columns
-    renamed through scoreCV_mapD/scoreBT_mapD, the metadata columns merged in
-    on the model name, and a Model_name column
-
-    Calls sys.exit() when inputD is empty or when any of the consistency
-    checks on the intermediate dfs fails
-    '''
-    if not inputD:
-        sys.exit('\nFAIL: No model scores supplied, nothing to process')
-
-    scoresDF = pd.DataFrame(inputD)
-
-    #------------------------
-    #  Extracting split_name
-    #-----------------------
-    # Normally a single split type; if the models disagree report all of
-    # them rather than leaving the name undefined for the messages below
-    tts_split_nameL = sorted({str(v['tts_split']) for v in inputD.values()})
-    tts_split_name = ', '.join(tts_split_nameL)
-    print('\nExtracting tts_split_name:', tts_split_name)
-
-    #------------------------
-    #  WF: only CV and BTS
-    #-----------------------
-    scoresDFT = scoresDF.T
-
-    # .copy() so that adding 'source_data' writes to an independent df
-    scoresDF_CV = scoresDFT.filter(regex='^test_.*$', axis = 1).copy()
-    # map colnames for consistency to allow concatenating
-    scoresDF_CV.columns = scoresDF_CV.columns.map(scoreCV_mapD)
-    scoresDF_CV['source_data'] = 'CV'
-
-    scoresDF_BT = scoresDFT.filter(regex='^bts_.*$', axis = 1).copy()
-    # map colnames for consistency to allow concatenating
-    scoresDF_BT.columns = scoresDF_BT.columns.map(scoreBT_mapD)
-    scoresDF_BT['source_data'] = 'BT'
-
-    # metadata: every column that is not a test_, bts_ or train_ score
-    metaDF = scoresDFT.filter(regex='^(?!test_.*$|bts_.*$|train_.*$).*')
-
-    print('\nTotal cols in each df:'
-          , '\nCV df:', len(scoresDF_CV.columns)
-          , '\nBT_df:', len(scoresDF_BT.columns)
-          , '\nmetaDF:', len(metaDF.columns))
-
-    # Needed by the final check whichever branch is taken below
-    expected_ncols_out = len(scoresDF_BT.columns) + len(metaDF.columns)
-
-    if  len(scoresDF_CV.columns) == len(scoresDF_BT.columns):
-        print('\nFirst proceeding to rowbind CV and BT dfs:')
-        print('\nFinal output should have:', expected_ncols_out, 'columns' )
-
-    #-----------------
-    # Combine WF
-    #-----------------
-    dfs_combine_wf = [scoresDF_CV, scoresDF_BT]
-
-    print('\nCombinig', len(dfs_combine_wf), 'using pd.concat by row ~ rowbind'
-          , '\nChecking Dims of df to combine:'
-          , '\nDim of CV:', scoresDF_CV.shape
-          , '\nDim of BT:', scoresDF_BT.shape)
-
-    dfs_nrows_wf = max(len(df) for df in dfs_combine_wf)
-    dfs_ncols_wf = max(len(df.columns) for df in dfs_combine_wf)
-    print(dfs_ncols_wf)
-
-    expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
-    expected_ncols_wf = dfs_ncols_wf
-
-    # Keep the CV column order; a raw set intersection would reorder the
-    # output columns from one interpreter run to the next
-    common_cols_wf = [col for col in scoresDF_CV.columns if col in scoresDF_BT.columns]
-    print('\nNumber of Common columns:', len(common_cols_wf)
-          , '\nThese are:', common_cols_wf)
-
-    if len(common_cols_wf) != dfs_ncols_wf:
-        # Nothing further can be built without the combined df
-        sys.exit('\nConcatenting dfs not possible [WF],check numbers ')
-
-    combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index=False)
-    print('\nConcatenating dfs with different resampling methods [WF]:'
-          , '\nSplit type:', tts_split_name
-          , '\nNo. of dfs combining:', len(dfs_combine_wf))
-
-    if len(combined_baseline_wf) == expected_nrows_wf  and len(combined_baseline_wf.columns) == expected_ncols_wf:
-        print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
-              , '\nnrows in combined_df_wf:', len(combined_baseline_wf)
-              , '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
-    else:
-        print('\nFAIL: concatenating failed'
-              , '\nExpected nrows:', expected_nrows_wf
-              , '\nGot:', len(combined_baseline_wf)
-              , '\nExpected ncols:', expected_ncols_wf
-              , '\nGot:', len(combined_baseline_wf.columns))
-        sys.exit('\nFIRST IF FAILS')
-
-    #-------------------------------------
-    # Combine WF+Metadata: Final output
-    #-------------------------------------
-    # both dfs are indexed by model name: the same models must be in each
-    if set(combined_baseline_wf.index) == set(metaDF.index):
-        print('\nPASS: proceeding to merge metadata with CV and BT dfs')
-        combDF = pd.merge(combined_baseline_wf, metaDF, left_index = True, right_index = True)
-    else:
-        sys.exit('\nFAIL: Could not merge metadata with CV and BT dfs')
-
-    if len(combDF.columns) == expected_ncols_out:
-        print('\nPASS: Combined df has expected ncols')
-    else:
-        sys.exit('\nFAIL: Length mismatch for combined_df')
-
-    print('\nAdding column: Model_name')
-
-    combDF['Model_name'] = combDF.index
-
-    print('\n========================================================='
-          , '\nSUCCESS: Ran multiple classifiers'
-          , '\n=======================================================')
-
-    return combDF
-
-###############################################################################
-#%% Feature selection function ################################################
-############################
-# fsgs_rfecv() 
-############################
-# Run FS using some classifier models
-# 
-def fsgs_rfecv(input_df
-         , target
-         , param_gridLd = [{'fs__min_features_to_select' : [1]}]
-         , blind_test_df = pd.DataFrame()
-         , blind_test_target = pd.Series(dtype = 'int64')
-         , estimator = LogisticRegression(**rs) # placeholder
-         , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
-         , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv =  skf_cv, scoring = 'matthews_corrcoef')
-         , cv_method =  skf_cv
-         , var_type = ['numerical', 'categorical' , 'mixed']
-         , verbose = 3
-         ):
-    '''
-    returns
-    Dict containing results from FS and hyperparam tuning for a given estiamtor
-    
-    >>> ADD MORE <<<
-    
-    optimised/selected based on mcc
-    
-    '''
-    ###########################################################################
-    #================================================
-    # Determine categorical and numerical features
-    #================================================
-    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
-    numerical_ix
-    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
-    categorical_ix    
-    
-    #================================================
-    # Determine preprocessing steps ~ var_type
-    #================================================
-    if var_type == 'numerical':
-        t = [('num', MinMaxScaler(), numerical_ix)]
-    
-    if var_type == 'categorical':
-        t = [('cat', OneHotEncoder(), categorical_ix)]
-    
-    if var_type == 'mixed':
-        t = [('cat', OneHotEncoder(), categorical_ix)
-              , ('num', MinMaxScaler(), numerical_ix)]
-        
-    col_transform = ColumnTransformer(transformers = t
-                                        , remainder='passthrough')
-    
-    ###########################################################################
-    #==================================================
-    # Create var_type ~ column names
-    # using one hot encoder with RFECV means 
-    # the names internally are lost. Hence
-    # fit col_transformeer to my input_df and get 
-    # all the column names out and stored in a var
-    # to allow the 'selected features' to be subsetted
-    # from the numpy boolean array
-    #=================================================
-    col_transform.fit(input_df)
-    col_transform.get_feature_names_out()
-    
-    var_type_colnames = col_transform.get_feature_names_out()
-    var_type_colnames = pd.Index(var_type_colnames)
-    
-    if var_type == 'mixed':
-        print('\nVariable type is:', var_type
-              , '\nNo. of columns in input_df:', len(input_df.columns)
-              , '\nNo. of columns post one hot encoder:', len(var_type_colnames))
-    else:
-        print('\nNo. of columns in input_df:', len(input_df.columns))
-        
-    #==================================
-    # Build FS with supplied estimator
-    #==================================
-    if use_fs:
-        fs = custom_fs
-    else:
-        fs = RFECV(estimator, cv = skf_cv, scoring = 'matthews_corrcoef')
-    
-    #==================================
-    # Build basic param grid
-    #==================================
-    # param_gridD = [
-    #     {'fs__min_features_to_select' : [1] 
-    #       }]
-        
-    ############################################################################   
-    # Create Pipeline object
-    pipe = Pipeline([
-        ('pre', col_transform),
-        ('fs', fs),
-        ('clf', estimator)])
-    ############################################################################   
-    # Define GridSearchCV
-    gscv_fs = GridSearchCV(pipe
-                           #, param_gridLd = param_gridD
-                           , param_gridLd
-                           , cv = cv_method
-                           , scoring = scoring_fn
-                           , refit = 'mcc'
-                           , verbose = 3
-                           , return_train_score = True
-                           , **njobs)
-    
-    gscv_fs.fit(input_df, target)
-    
-    ###########################################################################
-    # Get best param and scores out
-    gscv_fs.best_params_
-    gscv_fs.best_score_
-    
-    # Training best score corresponds to the max of the mean_test<score>
-    train_bscore = round(gscv_fs.best_score_, 2); train_bscore
-    print('\nTraining best score (MCC):', train_bscore)
-    gscv_fs.cv_results_['mean_test_mcc']
-    round(gscv_fs.cv_results_['mean_test_mcc'].max(),2)
-    round(np.nanmax(gscv_fs.cv_results_['mean_test_mcc']),2)
-    
-    check_train_score = [round(gscv_fs.cv_results_['mean_test_mcc'].max(),2)
-                        , round(np.nanmax(gscv_fs.cv_results_['mean_test_mcc']),2)]
-    
-    check_train_score = np.nanmax(check_train_score)
-    
-    # Training results
-    gscv_tr_resD = gscv_fs.cv_results_
-    mod_refit_param =  gscv_fs.refit
-    
-    # sanity check
-    if train_bscore == check_train_score:
-        print('\nVerified training score (MCC):', train_bscore )
-    else:
-        sys.exit('\nTraining score could not be internatlly verified. Please check training results dict')
-        
-    #-------------------------
-    # Dict of CV results
-    #-------------------------
-    cv_allD = gscv_fs.cv_results_
-    cvdf0   = pd.DataFrame(cv_allD)
-    cvdf    = cvdf0.filter(regex='mean_test', axis = 1)
-    cvdfT   = cvdf.T
-    cvdfT.columns = ['cv_score']
-    cvdfTr = cvdfT.loc[:,'cv_score'].round(decimals = 2) # round values
-    cvD     = cvdfTr.to_dict()
-    print('\n CV results dict generated for:', len(scoring_fn), 'scores'
-          , '\nThese are:', scoring_fn.keys())
-        
-    #-------------------------
-    # Blind test: REAL check!
-    #-------------------------
-    #tp = gscv_fs.predict(X_bts)
-    tp = gscv_fs.predict(blind_test_df)
-
-    print('\nMCC on Blind test:'     , round(matthews_corrcoef(blind_test_target, tp),2))
-    print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, tp),2))
-    
-    #=================
-    # info extraction
-    #=================
-    # gives input vals??
-    gscv_fs._check_n_features
-    
-    # gives gscv params used
-    gscv_fs._get_param_names()
-    
-    # gives ??
-    gscv_fs.best_estimator_
-    gscv_fs.best_params_ # gives best estimator params as a dict 
-    gscv_fs.best_estimator_._final_estimator # similar to above, doesn't contain max_iter
-    gscv_fs.best_estimator_.named_steps['fs'].get_support()
-    gscv_fs.best_estimator_.named_steps['fs'].ranking_ # array of ranks for the features
-    
-    gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.mean()
-    gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max()
-    #gscv_fs.best_estimator_.named_steps['fs'].grid_scores_
-    
-    estimator_mask = gscv_fs.best_estimator_.named_steps['fs'].get_support()
-
-    
-    ############################################################################
-    #============
-    # FS results
-    #============
-    # Now get the features out
-    
-    #--------------
-    # All features 
-    #--------------
-    all_features = gscv_fs.feature_names_in_
-    n_all_features =  gscv_fs.n_features_in_
-    #all_features = gsfit.feature_names_in_
-    
-    #--------------
-    # Selected features by the classifier
-    # Important to have var_type_colnames here
-    #----------------
-    #sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] 3 only for numerical df    
-    sel_features = var_type_colnames[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
-    n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_
-    
-    #--------------
-    # Get model name
-    #--------------
-    model_name     = gscv_fs.best_estimator_.named_steps['clf']
-    b_model_params = gscv_fs.best_params_
-    
-    print('\n========================================'
-          , '\nRunning model:'
-          , '\nModel name:', model_name
-          , '\n==============================================='
-          , '\nRunning feature selection with RFECV for model'
-          , '\nTotal no. of features in model:', len(all_features)
-          , '\nThese are:\n',  all_features, '\n\n'
-          , '\nNo of features for best model: ', n_sf
-          , '\nThese are:', sel_features, '\n\n'
-          , '\nBest Model hyperparams:', b_model_params
-          )
-    
-    ###########################################################################
-    ############################## OUTPUT #####################################
-    ###########################################################################
-    #=========================
-    # Blind test: BTS results
-    #=========================
-    # Build the final results with all scores for a feature selected model
-    #bts_predict = gscv_fs.predict(X_bts)
-    bts_predict = gscv_fs.predict(blind_test_df)
-
-    print('\nMCC on Blind test:'     , round(matthews_corrcoef(blind_test_target, bts_predict),2))
-    print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
-    bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
-    
-    # Diff b/w train and bts test scores
-    train_test_diff = train_bscore - bts_mcc_score
-    print('\nDiff b/w train and blind test score (MCC):', train_test_diff)
-    
-    lr_btsD ={}
-    #lr_btsD['bts_mcc']       = bts_mcc_score
-    lr_btsD['bts_fscore']    = round(f1_score(blind_test_target, bts_predict),2)
-    lr_btsD['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2)
-    lr_btsD['bts_recall']    = round(recall_score(blind_test_target, bts_predict),2)
-    lr_btsD['bts_accuracy']  = round(accuracy_score(blind_test_target, bts_predict),2)
-    lr_btsD['bts_roc_auc']   = round(roc_auc_score(blind_test_target, bts_predict),2)
-    lr_btsD['bts_jcc']       = round(jaccard_score(blind_test_target, bts_predict),2)
-    lr_btsD
-    
-    #===========================
-    # Add FS related model info
-    #===========================
-    model_namef = str(model_name)
-    # FIXME: doesn't tell you which it has chosen
-    fs_methodf = str(gscv_fs.best_estimator_.named_steps['fs'])
-    all_featuresL = list(all_features)
-    fs_res_arrayf = str(list( gscv_fs.best_estimator_.named_steps['fs'].get_support()))
-    fs_res_array_rankf = str(list( gscv_fs.best_estimator_.named_steps['fs'].ranking_))
-    sel_featuresf = list(sel_features)
-    n_sf = int(n_sf)
-    
-    output_modelD = {'model_name': model_namef
-                     , 'model_refit_param': mod_refit_param
-                     , 'Best_model_params': b_model_params 
-                     , 'n_all_features': n_all_features
-                     , 'fs_method': fs_methodf
-                     , 'fs_res_array': fs_res_arrayf
-                     , 'fs_res_array_rank': fs_res_array_rankf
-                     , 'all_feature_names': all_featuresL
-                     , 'n_sel_features': n_sf
-                     , 'sel_features_names': sel_featuresf}
-    #output_modelD
-    
-    #========================================
-    # Update output_modelD with bts_results
-    #========================================
-    output_modelD.update(lr_btsD)
-    output_modelD
-    
-    output_modelD['train_score (MCC)'] = train_bscore
-    output_modelD['bts_mcc'] = bts_mcc_score
-    output_modelD['train_bts_diff'] = round(train_test_diff,2)
-    print(output_modelD)
-    
-    nlen = len(output_modelD)
-    
-    #========================================
-    # Update output_modelD with cv_results
-    #========================================
-    output_modelD.update(cvD)
-    
-    if (len(output_modelD) == nlen + len(cvD)):
-        print('\nFS run complete for model:', estimator
-              , '\nFS using:', fs
-              , '\nOutput dict size:', len(output_modelD))
-        return(output_modelD)
-    else:
-        sys.exit('\nFAIL:numbers mismatch output dict length not as expected. Please check')
-    
--- a/scripts/ml/log_FS_pnca_7030.txt
+++ b/scripts/ml/log_FS_pnca_7030.txt
--- a/scripts/ml/ml_data.py
+++ b/scripts/ml/ml_data.py
@ -1,730 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Sun Mar  6 13:41:54 2022
-
-@author: tanu
-"""
-#def setvars(gene,drug):
-#https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
-import os, sys
-import pandas as pd
-import numpy as np
-print(np.__version__)
-print(pd.__version__)
-import pprint as pp
-from copy import deepcopy
-from collections import Counter
-from sklearn.impute import KNNImputer as KNN
-from imblearn.over_sampling import RandomOverSampler
-from imblearn.under_sampling import RandomUnderSampler
-from imblearn.over_sampling import SMOTE
-from sklearn.datasets import make_classification
-from imblearn.combine import SMOTEENN
-from imblearn.combine import SMOTETomek
-
-from imblearn.over_sampling import SMOTENC
-from imblearn.under_sampling import EditedNearestNeighbours
-from imblearn.under_sampling import RepeatedEditedNearestNeighbours
-
-from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
-from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
-
-from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
-from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
-
-from sklearn.pipeline import Pipeline, make_pipeline
-#%% GLOBALS
-rs = {'random_state': 42}
-njobs = {'n_jobs': 10}
-
-scoring_fn =  ({ 'mcc'         : make_scorer(matthews_corrcoef)
-                , 'accuracy'   : make_scorer(accuracy_score)
-                , 'fscore'     : make_scorer(f1_score)
-                , 'precision'  : make_scorer(precision_score)
-                , 'recall'     : make_scorer(recall_score)
-                , 'roc_auc'    : make_scorer(roc_auc_score)
-                , 'jcc'        : make_scorer(jaccard_score)
-            }) 
-  
-skf_cv = StratifiedKFold(n_splits = 10
-                          #, shuffle = False, random_state= None)
-                           , shuffle = True,**rs)
-
-rskf_cv = RepeatedStratifiedKFold(n_splits = 10
-                                  , n_repeats = 3
-                                  , **rs)
-
-mcc_score_fn  = {'mcc': make_scorer(matthews_corrcoef)}
-jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
-
-#%% FOR LATER: Combine ED logo data
-#%% DONE: active aa site annotations **DONE on 15/05/2022 as part of generating merged_dfs
-###########################################################################
-rs = {'random_state': 42}
-njobs = {'n_jobs': 10}
-homedir = os.path.expanduser("~")
-
-geneL_basic     = ['pnca']
-geneL_na        = ['gid']
-geneL_na_ppi2   = ['rpob']
-geneL_ppi2      = ['alr', 'embb', 'katg']
-
-#num_type = ['int64', 'float64']
-num_type = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
-cat_type = ['object', 'bool']
-
-#==============
-# directories
-#==============
-datadir = homedir + '/git/Data/'
-indir   = datadir + drug + '/input/'
-outdir  = datadir + drug + '/output/'
-
-#=======
-# input
-#=======
-
-#---------
-# File 1
-#---------
-infile_ml1 = outdir + gene.lower() + '_merged_df3.csv' 
-#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
-
-my_features_df = pd.read_csv(infile_ml1, index_col = 0) 
-my_features_df  = my_features_df .reset_index(drop = True)
-my_features_df.index
-
-my_features_df.dtypes
-mycols = my_features_df.columns
-
-#---------
-# File 2
-#---------
-infile_aaindex = outdir + 'aa_index/' + gene.lower() + '_aa.csv' 
-aaindex_df = pd.read_csv(infile_aaindex, index_col = 0) 
-aaindex_df.dtypes
-
-#-----------
-# check for non-numerical columns
-#-----------
-if any(aaindex_df.dtypes==object):
-    print('\naaindex_df contains non-numerical data')
-
-aaindex_df_object = aaindex_df.select_dtypes(include = cat_type)
-print('\nTotal no. of non-numerial columns:', len(aaindex_df_object.columns))
-
-expected_aa_ncols = len(aaindex_df.columns) - len(aaindex_df_object.columns)
-
-#-----------
-# Extract numerical data only
-#-----------
-print('\nSelecting numerical data only')
-aaindex_df = aaindex_df.select_dtypes(include = num_type)
-
-#---------------------------
-# aaindex: sanity check 1
-#---------------------------
-if len(aaindex_df.columns) == expected_aa_ncols:
-    print('\nPASS: successfully selected numerical columns only for aaindex_df')
-else:
-    print('\nFAIL: Numbers mismatch'
-          , '\nExpected ncols:', expected_aa_ncols
-          , '\nGot:', len(aaindex_df.columns))    
-    
-#---------------
-# check for NA
-#---------------
-print('\nNow checking for NA in the remaining aaindex_cols')
-c1 = aaindex_df.isna().sum()
-c2 = c1.sort_values(ascending=False)
-print('\nCounting aaindex_df cols with NA'
-      , '\nncols with NA:', sum(c2>0), 'columns'
-      , '\nDropping these...'
-      , '\nOriginal ncols:', len(aaindex_df.columns)
-      )
-aa_df = aaindex_df.dropna(axis=1)
-
-print('\nRevised df ncols:', len(aa_df.columns))
-
-c3 = aa_df.isna().sum()
-c4 = c3.sort_values(ascending=False)
-
-print('\nChecking NA in revised df...')
-
-if sum(c4>0):
-    sys.exit('\nFAIL: aaindex_df still contains cols with NA, please check and drop these before proceeding...')
-else:
-    print('\nPASS: cols with NA successfully dropped from aaindex_df'
-          , '\nProceeding with combining aa_df with other features_df')
-    
-#---------------------------
-# aaindex: sanity check 2
-#---------------------------
-expected_aa_ncols2 =  len(aaindex_df.columns) - sum(c2>0)  
-if len(aa_df.columns) == expected_aa_ncols2:
-    print('\nPASS: ncols match'
-          , '\nExpected ncols:', expected_aa_ncols2
-          , '\nGot:', len(aa_df.columns))
-else:
-    print('\nFAIL: Numbers mismatch'
-          , '\nExpected ncols:', expected_aa_ncols2
-          , '\nGot:', len(aa_df.columns))            
-    
-# Important: need this to identify aaindex cols    
-aa_df_cols = aa_df.columns
-print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
-
-###############################################################################
-#%% Combining my_features_df and aaindex_df
-#===========================
-# Merge my_df + aaindex_df
-#===========================
-
-if aa_df.columns[aa_df.columns.isin(my_features_df.columns)] == my_features_df.columns[my_features_df.columns.isin(aa_df.columns)]:
-    print('\nMerging on column: mutationinformation')   
-
-if len(my_features_df) == len(aa_df):
-    expected_nrows = len(my_features_df)
-    print('\nProceeding to merge, expected nrows in merged_df:', expected_nrows)
-else:
-    sys.exit('\nNrows mismatch, cannot merge. Please check'
-          , '\nnrows my_df:', len(my_features_df)
-          , '\nnrows aa_df:', len(aa_df))
-           
-#-----------------
-# Reset index: mutationinformation
-# Very important for merging
-#-----------------
-aa_df = aa_df.reset_index()
-
-expected_ncols = len(my_features_df.columns) + len(aa_df.columns) - 1 # for the no. of merging col
-
-#-----------------
-# Merge: my_features_df + aa_df
-#-----------------
-merged_df = pd.merge(my_features_df
-                     , aa_df
-                     , on = 'mutationinformation')
-
-#---------------------------
-# aaindex: sanity check 3
-#---------------------------
-if len(merged_df.columns) == expected_ncols:
-    print('\nPASS: my_features_df and aa_df successfully combined'
-          , '\nnrows:', len(merged_df)
-          , '\nncols:', len(merged_df.columns))
-else:
-    sys.exit('\nFAIL: could not combine my_features_df and aa_df'
-             , '\nCheck dims and merging cols!')
-    
-#--------
-# Reassign so downstream code doesn't need to change
-#--------
-my_df = merged_df.copy()
-
-#%% Data: my_df
-# Check if non structural pos have crept in
-# IDEALLY remove from source! But for rpoB do it here
-# Drop NA where numerical cols have them
-if gene.lower() in geneL_na_ppi2:
-    #D1148 get rid of
-    na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
-    my_df = my_df.drop(index=na_index)
-
-# FIXED: complete data for all muts inc L114M, F115L, V123L, V125I, V131M
-# if gene.lower() in ['embb']:
-#     na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
-#     my_df = my_df.drop(index=na_index)
-
-# # Sanity check for non-structural positions
-# print('\nChecking for non-structural postions')
-# na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
-# if len(na_index) > 0:
-#     print('\nNon-structural positions detected for gene:', gene.lower()
-#           , '\nTotal number of these detected:', len(na_index)
-#           , '\These are at index:', na_index
-#           , '\nOriginal nrows:', len(my_df)
-#           , '\nDropping these...')
-#     my_df = my_df.drop(index=na_index)
-#     print('\nRevised nrows:', len(my_df))
-# else:
-#     print('\nNo non-structural positions detected for gene:', gene.lower()
-#           , '\nnrows:', len(my_df))
-          
-
-###########################################################################
-#%% Add lineage calculation columns
-#FIXME: Check if this can be imported from config?
-total_mtblineage_uc = 8
-lineage_colnames = ['lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode']
-#bar = my_df[lineage_colnames]
-my_df['lineage_proportion']      = my_df['lineage_count_unique']/my_df['lineage_count_all']
-my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineage_uc
-###########################################################################
-#%% Active site annotation column
-# change from numberic to categorical
-
-if my_df['active_site'].dtype in num_type:
-    my_df['active_site'] = my_df['active_site'].astype(object)
-    my_df['active_site'].dtype
-#%% AA property change
-#--------------------
-# Water prop change
-#--------------------
-my_df['water_change'] = my_df['wt_prop_water'] + str('_to_') + my_df['mut_prop_water']
-my_df['water_change'].value_counts()
-
-water_prop_changeD = {
-    'hydrophobic_to_neutral'          : 'change'
-    , 'hydrophobic_to_hydrophobic'    : 'no_change'
-    , 'neutral_to_neutral'            : 'no_change'
-    , 'neutral_to_hydrophobic'        : 'change'
-    , 'hydrophobic_to_hydrophilic'    : 'change'
-    , 'neutral_to_hydrophilic'        : 'change'
-    , 'hydrophilic_to_neutral'        : 'change'
-    , 'hydrophilic_to_hydrophobic'    : 'change'
-    , 'hydrophilic_to_hydrophilic'    : 'no_change'
-}
-
-my_df['water_change'] = my_df['water_change'].map(water_prop_changeD)
-my_df['water_change'].value_counts()
-
-#--------------------
-# Polarity change
-#--------------------
-my_df['polarity_change'] = my_df['wt_prop_polarity'] + str('_to_') + my_df['mut_prop_polarity']
-my_df['polarity_change'].value_counts()
-
-polarity_prop_changeD = {
-    'non-polar_to_non-polar'     : 'no_change'
-    , 'non-polar_to_neutral'     : 'change'  
-    , 'neutral_to_non-polar'     : 'change'  
-    , 'neutral_to_neutral'       : 'no_change'  
-    , 'non-polar_to_basic'       : 'change'  
-    , 'acidic_to_neutral'        : 'change'  
-    , 'basic_to_neutral'         : 'change'  
-    , 'non-polar_to_acidic'      : 'change'  
-    , 'neutral_to_basic'         : 'change'  
-    , 'acidic_to_non-polar'      : 'change'  
-    , 'basic_to_non-polar'       : 'change'
-    , 'neutral_to_acidic'        : 'change'
-    , 'acidic_to_acidic'         : 'no_change'
-    , 'basic_to_acidic'          : 'change'
-    , 'basic_to_basic'           : 'no_change'
-    , 'acidic_to_basic'          : 'change'}
-
-my_df['polarity_change'] = my_df['polarity_change'].map(polarity_prop_changeD)
-my_df['polarity_change'].value_counts()
-
-#--------------------
-# Electrostatics change
-#--------------------
-my_df['electrostatics_change'] = my_df['wt_calcprop'] + str('_to_') + my_df['mut_calcprop']
-my_df['electrostatics_change'].value_counts()
-
-calc_prop_changeD = {
-        'non-polar_to_non-polar'     : 'no_change'
-        , 'non-polar_to_polar'       : 'change'
-        , 'polar_to_non-polar'       : 'change'
-        , 'non-polar_to_pos'         : 'change'
-        , 'neg_to_non-polar'         : 'change'
-        , 'non-polar_to_neg'         : 'change'
-        , 'pos_to_polar'             : 'change'
-        , 'pos_to_non-polar'         : 'change'
-        , 'polar_to_polar'           : 'no_change'
-        , 'neg_to_neg'               : 'no_change'
-        , 'polar_to_neg'             : 'change'
-        , 'pos_to_neg'               : 'change'
-        , 'pos_to_pos'               : 'no_change'
-        , 'polar_to_pos'             : 'change'
-        , 'neg_to_polar'             : 'change'
-        , 'neg_to_pos'               : 'change'
-}
-
-my_df['electrostatics_change'] = my_df['electrostatics_change'].map(calc_prop_changeD)
-my_df['electrostatics_change'].value_counts()
-
-#--------------------    
-# Summary change: Create a combined column summarising these three cols
-#--------------------
-detect_change = 'change'
-check_prop_cols = ['water_change', 'polarity_change', 'electrostatics_change']
-#my_df['aa_prop_change'] = (my_df.values == detect_change).any(1).astype(int)
-my_df['aa_prop_change'] = (my_df[check_prop_cols].values == detect_change).any(1).astype(int)
-my_df['aa_prop_change'].value_counts()
-my_df['aa_prop_change'].dtype
-
-my_df['aa_prop_change'] = my_df['aa_prop_change'].map({1:'change'
-                                                       , 0: 'no_change'})
-
-my_df['aa_prop_change'].value_counts()
-my_df['aa_prop_change'].dtype
-
-#%% IMPUTE values for OR [check script for exploration: UQ_or_imputer]
-#--------------------
-# Impute OR values
-#--------------------
-#or_cols = ['or_mychisq', 'log10_or_mychisq', 'or_fisher']
-sel_cols = ['mutationinformation', 'or_mychisq', 'log10_or_mychisq']
-or_cols = ['or_mychisq', 'log10_or_mychisq']
-
-print("count of NULL values before imputation\n")
-print(my_df[or_cols].isnull().sum())
-
-my_dfI = pd.DataFrame(index = my_df['mutationinformation'] )
-
-    
-my_dfI = pd.DataFrame(KNN(n_neighbors=3, weights="uniform").fit_transform(my_df[or_cols])
-                      , index =  my_df['mutationinformation']
-                      , columns = or_cols )
-my_dfI.columns = ['or_rawI', 'logorI']
-my_dfI.columns
-my_dfI = my_dfI.reset_index(drop = False) # prevents old index from being added as a column
-my_dfI.head()
-print("count of NULL values AFTER imputation\n")
-print(my_dfI.isnull().sum())
-
-#-------------------------------------------
-# OR df Merge: with original based on index
-#-------------------------------------------
-#my_df['index_bm'] = my_df.index
-mydf_imputed = pd.merge(my_df
-                    , my_dfI
-                    , on = 'mutationinformation')
-#mydf_imputed = mydf_imputed.set_index(['index_bm'])
-
-my_df['log10_or_mychisq'].isna().sum()
-mydf_imputed['log10_or_mychisq'].isna().sum()
-mydf_imputed['logorI'].isna().sum() # should be 0
-
-len(my_df.columns)
-len(mydf_imputed.columns)  
-
-#-----------------------------------------
-# REASSIGN my_df after imputing OR values
-#-----------------------------------------
-my_df = mydf_imputed.copy()
-
-if my_df['logorI'].isna().sum() == 0:
-    print('\nPASS: OR values imputed, data ready for ML')
-else:
-    sys.exit('\nFAIL: something went wrong, Data not ready for ML. Please check upstream!')
-
-#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-#---------------------------------------
-# TODO: try other imputation like MICE
-#---------------------------------------
-#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-#%%########################################################################
-#==========================
-#     Data for ML
-#==========================
-my_df_ml = my_df.copy()
-
-#%% Build X: input for ML
-common_cols_stabiltyN = ['ligand_distance'
-           , 'ligand_affinity_change'
-           , 'duet_stability_change'
-           , 'ddg_foldx'
-           , 'deepddg'
-           , 'ddg_dynamut2'
-           , 'mmcsm_lig'
-           , 'contacts']
-
-# Build stability columns ~ gene
-if gene.lower() in geneL_basic:
-    X_stabilityN = common_cols_stabiltyN
-    cols_to_mask = ['ligand_affinity_change']
-    
-if gene.lower() in geneL_ppi2:
-#    X_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity' , 'interface_dist'] 
-    geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist'] 
-    X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
-    cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
-
-if gene.lower() in geneL_na:
-#    X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] 
-    geneL_na_st_cols =  ['mcsm_na_affinity'] 
-    X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
-    cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
-
-if gene.lower() in geneL_na_ppi2:
-#    X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-    geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-    X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
-    cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
-
-
-X_foldX_cols = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
-, 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
-, 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
-, 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
-, 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
-, 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'
-]
-
-X_str =  ['rsa'
-           #, 'asa'
-           , 'kd_values'
-           , 'rd_values']    
-
-X_ssFN = X_stabilityN + X_str + X_foldX_cols
-
-X_evolFN =  ['consurf_score'
-           , 'snap2_score'
-           , 'provean_score']
-    
-X_genomic_mafor =  ['maf'
-                , 'logorI'
-                # , 'or_rawI'
-                # , 'or_mychisq'
-                # , 'or_logistic'
-                # , 'or_fisher'
-                # , 'pval_fisher'
-                ]
-
-X_genomic_linegae  = ['lineage_proportion'
-                      , 'dist_lineage_proportion'
-                      #, 'lineage' # could be included as a category but it has L2;L4  formatting
-                      , 'lineage_count_all'
-                      , 'lineage_count_unique'
-                      ]
-
-X_genomicFN = X_genomic_mafor + X_genomic_linegae
-
-#X_aaindexFN = list(aa_df_cols)
-
-#print('\nTotal no. of features for aaindex:', len(X_aaindexFN))
-
-# numerical feature names [NO aa_index]
-numerical_FN = X_ssFN  + X_evolFN + X_genomicFN
-
-
-# categorical feature names
-categorical_FN = ['ss_class'
-            # , 'wt_prop_water'
-            # , 'mut_prop_water'
-            # , 'wt_prop_polarity'
-            # , 'mut_prop_polarity'
-            # , 'wt_calcprop'
-            # , 'mut_calcprop'
-            , 'aa_prop_change'
-            , 'electrostatics_change'
-            , 'polarity_change'
-            , 'water_change'
-            , 'drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2]
-            , 'active_site' #[didn't use it for uq_v1]
-            #, 'gene_name' # will be required for the combined stuff
-             ]
-#----------------------------------------------
-# count numerical and categorical features
-#----------------------------------------------
-
-print('\nNo. of numerical features:', len(numerical_FN)
-      , '\nNo. of categorical features:', len(categorical_FN))
-
-###########################################################################
-#=======================
-# Masking columns:
-# (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
-#=======================
-# my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()
-# my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
-
-# my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0
-# (my_df_ml['ligand_affinity_change'] == 0).sum()
-
-my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
-my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
-my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
-
-# mask the mcsm affinity related columns where ligand distance > 10
-my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
-(my_df_ml['ligand_affinity_change'] == 0).sum()
-
-mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]  
-
-# write file for check
-mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
-mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
-
-#===================================================
-# Training and BLIND test set [UQ]: actual vs imputed
-# No aa index but active_site included
-# dst with actual values  : training set
-# dst with imputed values : blind test
-#==================================================
-my_df_ml[drug].isna().sum()  #'na' ones are the blind_test set
-
-blind_test_df = my_df_ml[my_df_ml[drug].isna()]
-blind_test_df.shape
-
-training_df = my_df_ml[my_df_ml[drug].notna()]
-training_df.shape
-
-# Target 1: dst_mode
-training_df[drug].value_counts()
-training_df['dst_mode'].value_counts()
-####################################################################
-#%% extracting dfs based on numerical, categorical column names
-#----------------------------------
-# WITHOUT the target var included
-#----------------------------------
-num_df = training_df[numerical_FN]
-num_df.shape
-
-cat_df = training_df[categorical_FN]
-cat_df.shape
-
-all_df = training_df[numerical_FN + categorical_FN]
-all_df.shape
-
-#------------------------------
-# WITH the target var included:
-    #'wtgt': with target
-#------------------------------
-# drug and dst_mode should be the same thing
-num_df_wtgt = training_df[numerical_FN + ['dst_mode']]
-num_df_wtgt.shape
-
-cat_df_wtgt = training_df[categorical_FN + ['dst_mode']]
-cat_df_wtgt.shape
-
-all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']]
-all_df_wtgt.shape
-#%%########################################################################
-#============
-# ML data
-#============
-#------
-# X: Training and Blind test (BTS)
-#------
-X     = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL
-X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL
-#X = all_df_wtgt[numerical_FN] # training numerical only
-#X_bts = blind_test_df[numerical_FN] # blind test data numerical
-
-#------
-# y
-#------
-y = all_df_wtgt['dst_mode'] # training data y
-y_bts = blind_test_df['dst_mode'] # blind data test y
-
-#X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']] 
-
-# Quick check
-#(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
-for i in range(len(cols_to_mask)):
-    ind = i+1
-    print('\nindex:', i, '\nind:', ind)
-    print('\nMask count check:'
-          , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
-          )
-
-print('Original Data\n', Counter(y)
-      , 'Data dim:', X.shape)
-
-yc1 = Counter(y)
-yc1_ratio = yc1[0]/yc1[1]
-
-yc2 = Counter(y_bts)
-yc2_ratio = yc2[0]/yc2[1]
-
-print('\n-------------------------------------------------------------'
-      , '\nSuccessfully split data: UQ [no aa_index but active site included] training'
-      , '\nactual values: training set'
-      , '\nimputed values: blind test set'
-      , '\nTrain data size:', X.shape
-      , '\nTest data size:', X_bts.shape
-      , '\ny_train numbers:', yc1
-      , '\ny_train ratio:',yc1_ratio
-      , '\n'
-      , '\ny_test_numbers:', yc2
-      , '\ny_test ratio:', yc2_ratio
-      , '\n-------------------------------------------------------------'
-      )
-###########################################################################
-#%% 
-###########################################################################
-#                               RESAMPLING
-###########################################################################
-#------------------------------
-# Simple Random oversampling
-# [Numerical + catgeorical]
-#------------------------------
-oversample = RandomOverSampler(sampling_strategy='minority')
-X_ros, y_ros = oversample.fit_resample(X, y)
-print('Simple Random OverSampling\n', Counter(y_ros))
-print(X_ros.shape)
-
-#------------------------------
-# Simple Random Undersampling
-# [Numerical + catgeorical]
-#------------------------------
-undersample = RandomUnderSampler(sampling_strategy='majority')
-X_rus, y_rus = undersample.fit_resample(X, y)
-print('Simple Random UnderSampling\n', Counter(y_rus))
-print(X_rus.shape)
-
-#------------------------------
-# Simple combine ROS and RUS
-# [Numerical + catgeorical]
-#------------------------------
-oversample = RandomOverSampler(sampling_strategy='minority')
-X_ros, y_ros = oversample.fit_resample(X, y)
-undersample = RandomUnderSampler(sampling_strategy='majority')
-X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
-print('Simple Combined Over and UnderSampling\n',  Counter(y_rouC))
-print(X_rouC.shape)
-
-#------------------------------
-# SMOTE_NC: oversampling 
-# [numerical + categorical]
-#https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
-#------------------------------
-# Determine categorical and numerical features
-numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
-numerical_ix
-num_featuresL = list(numerical_ix)
-numerical_colind = X.columns.get_indexer(list(numerical_ix) )
-numerical_colind
-
-categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
-categorical_ix    
-categorical_colind = X.columns.get_indexer(list(categorical_ix))
-categorical_colind
-
-k_sm = 5 # 5 is deafult
-sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
-X_smnc, y_smnc = sm_nc.fit_resample(X, y)
-print('SMOTE_NC OverSampling\n', Counter(y_smnc))
-print(X_smnc.shape)
-globals().update(locals()) # TROLOLOLOLOLOLS
-#print("i did a horrible hack :-)")
-###############################################################################
-#%% SMOTE RESAMPLING for NUMERICAL ONLY*
-# #------------------------------
-# # SMOTE: Oversampling
-# # [Numerical ONLY]
-# #------------------------------
-# k_sm = 1
-# sm = SMOTE(sampling_strategy = 'auto', k_neighbors = k_sm, **rs)
-# X_sm, y_sm = sm.fit_resample(X, y)
-# print(X_sm.shape)
-# print('SMOTE OverSampling\n', Counter(y_sm))
-# y_sm_df = y_sm.to_frame()
-# y_sm_df.value_counts().plot(kind = 'bar')
-
-# #------------------------------
-# # SMOTE: Over + Undersampling COMBINED
-# # [Numerical ONLY]
-# #-----------------------------
-# sm_enn = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all', **rs, **njobs ))
-# X_enn, y_enn = sm_enn.fit_resample(X, y)
-# print(X_enn.shape)
-# print('SMOTE Over+Under Sampling combined\n', Counter(y_enn))
-
-###############################################################################
-# TODO: Find over and undersampling JUST for categorical data
--- a/scripts/ml/ml_data_7030.py
+++ b/scripts/ml/ml_data_7030.py
@ -1,806 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Sun Mar  6 13:41:54 2022
-
-@author: tanu
-"""
-def setvars(gene,drug):
-    #https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
-    import os, sys
-    import pandas as pd
-    import numpy as np
-    print(np.__version__)
-    print(pd.__version__)
-    import pprint as pp
-    from copy import deepcopy
-    from collections import Counter
-    from sklearn.impute import KNNImputer as KNN
-    from imblearn.over_sampling import RandomOverSampler
-    from imblearn.under_sampling import RandomUnderSampler
-    from imblearn.over_sampling import SMOTE
-    from sklearn.datasets import make_classification
-    from imblearn.combine import SMOTEENN
-    from imblearn.combine import SMOTETomek
-    
-    from imblearn.over_sampling import SMOTENC
-    from imblearn.under_sampling import EditedNearestNeighbours
-    from imblearn.under_sampling import RepeatedEditedNearestNeighbours
-    
-    from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
-    from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
-    
-    from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
-    from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
-    
-    from sklearn.pipeline import Pipeline, make_pipeline
-    import argparse
-    import re
-    #%% GLOBALS
-    tts_split = "70_30"
-
-    rs = {'random_state': 42}
-    njobs = {'n_jobs': 10}
-    
-    scoring_fn =  ({ 'mcc'         : make_scorer(matthews_corrcoef)
-                    , 'accuracy'   : make_scorer(accuracy_score)
-                    , 'fscore'     : make_scorer(f1_score)
-                    , 'precision'  : make_scorer(precision_score)
-                    , 'recall'     : make_scorer(recall_score)
-                    , 'roc_auc'    : make_scorer(roc_auc_score)
-                    , 'jcc'        : make_scorer(jaccard_score)
-                }) 
-      
-    skf_cv = StratifiedKFold(n_splits = 10
-                              #, shuffle = False, random_state= None)
-                               , shuffle = True,**rs)
-    
-    rskf_cv = RepeatedStratifiedKFold(n_splits = 10
-                                      , n_repeats = 3
-                                      , **rs)
-    
-    mcc_score_fn  = {'mcc': make_scorer(matthews_corrcoef)}
-    jacc_score_fn = {'jcc': make_scorer(jaccard_score)}   
-    #%% FOR LATER: Combine ED logo data
-    ###########################################################################
-
-    homedir = os.path.expanduser("~")
-    
-    geneL_basic     = ['pnca']
-    geneL_na        = ['gid']
-    geneL_na_ppi2   = ['rpob']
-    geneL_ppi2      = ['alr', 'embb', 'katg']
-    
-    #num_type = ['int64', 'float64']
-    num_type = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
-    cat_type = ['object', 'bool']
-    
-    #==============
-    # directories
-    #==============
-    datadir = homedir + '/git/Data/'
-    indir   = datadir + drug + '/input/'
-    outdir  = datadir + drug + '/output/'
-    
-    #=======
-    # input
-    #=======
-    
-    #---------
-    # File 1
-    #---------
-    infile_ml1 = outdir + gene.lower() + '_merged_df3.csv' 
-    #infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
-    
-    my_features_df = pd.read_csv(infile_ml1, index_col = 0) 
-    my_features_df  = my_features_df .reset_index(drop = True)
-    my_features_df.index
-    
-    my_features_df.dtypes
-    mycols = my_features_df.columns
-    
-    #---------
-    # File 2
-    #---------
-    infile_aaindex = outdir + 'aa_index/' + gene.lower() + '_aa.csv' 
-    aaindex_df = pd.read_csv(infile_aaindex, index_col = 0) 
-    aaindex_df.dtypes
-    
-    #-----------
-    # check for non-numerical columns
-    #-----------
-    if any(aaindex_df.dtypes==object):
-        print('\naaindex_df contains non-numerical data')
-    
-    aaindex_df_object = aaindex_df.select_dtypes(include = cat_type)
-    print('\nTotal no. of non-numerial columns:', len(aaindex_df_object.columns))
-    
-    expected_aa_ncols = len(aaindex_df.columns) - len(aaindex_df_object.columns)
-    
-    #-----------
-    # Extract numerical data only
-    #-----------
-    print('\nSelecting numerical data only')
-    aaindex_df = aaindex_df.select_dtypes(include = num_type)
-    
-    #---------------------------
-    # aaindex: sanity check 1
-    #---------------------------
-    if len(aaindex_df.columns) == expected_aa_ncols:
-        print('\nPASS: successfully selected numerical columns only for aaindex_df')
-    else:
-        print('\nFAIL: Numbers mismatch'
-              , '\nExpected ncols:', expected_aa_ncols
-              , '\nGot:', len(aaindex_df.columns))    
-        
-    #---------------
-    # check for NA
-    #---------------
-    print('\nNow checking for NA in the remaining aaindex_cols')
-    c1 = aaindex_df.isna().sum()
-    c2 = c1.sort_values(ascending=False)
-    print('\nCounting aaindex_df cols with NA'
-          , '\nncols with NA:', sum(c2>0), 'columns'
-          , '\nDropping these...'
-          , '\nOriginal ncols:', len(aaindex_df.columns)
-          )
-    aa_df = aaindex_df.dropna(axis=1)
-    
-    print('\nRevised df ncols:', len(aa_df.columns))
-    
-    c3 = aa_df.isna().sum()
-    c4 = c3.sort_values(ascending=False)
-    
-    print('\nChecking NA in revised df...')
-    
-    if sum(c4>0):
-        sys.exit('\nFAIL: aaindex_df still contains cols with NA, please check and drop these before proceeding...')
-    else:
-        print('\nPASS: cols with NA successfully dropped from aaindex_df'
-              , '\nProceeding with combining aa_df with other features_df')
-        
-    #---------------------------
-    # aaindex: sanity check 2
-    #---------------------------
-    expected_aa_ncols2 =  len(aaindex_df.columns) - sum(c2>0)  
-    if len(aa_df.columns) == expected_aa_ncols2:
-        print('\nPASS: ncols match'
-              , '\nExpected ncols:', expected_aa_ncols2
-              , '\nGot:', len(aa_df.columns))
-    else:
-        print('\nFAIL: Numbers mismatch'
-              , '\nExpected ncols:', expected_aa_ncols2
-              , '\nGot:', len(aa_df.columns))            
-        
-    # Important: need this to identify aaindex cols    
-    aa_df_cols = aa_df.columns
-    print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
-    
-    ###############################################################################
-    #%% Combining my_features_df and aaindex_df
-    #===========================
-    # Merge my_df + aaindex_df
-    #===========================
-    
-    if aa_df.columns[aa_df.columns.isin(my_features_df.columns)] == my_features_df.columns[my_features_df.columns.isin(aa_df.columns)]:
-        print('\nMerging on column: mutationinformation')   
-    
-    if len(my_features_df) == len(aa_df):
-        expected_nrows = len(my_features_df)
-        print('\nProceeding to merge, expected nrows in merged_df:', expected_nrows)
-    else:
-        sys.exit('\nNrows mismatch, cannot merge. Please check'
-              , '\nnrows my_df:', len(my_features_df)
-              , '\nnrows aa_df:', len(aa_df))
-               
-    #-----------------
-    # Reset index: mutationinformation
-    # Very important for merging
-    #-----------------
-    aa_df = aa_df.reset_index()
-    
-    expected_ncols = len(my_features_df.columns) + len(aa_df.columns) - 1 # for the no. of merging col
-    
-    #-----------------
-    # Merge: my_features_df + aa_df
-    #-----------------
-    merged_df = pd.merge(my_features_df
-                         , aa_df
-                         , on = 'mutationinformation')
-    
-    #---------------------------
-    # aaindex: sanity check 3
-    #---------------------------
-    if len(merged_df.columns) == expected_ncols:
-        print('\nPASS: my_features_df and aa_df successfully combined'
-              , '\nnrows:', len(merged_df)
-              , '\nncols:', len(merged_df.columns))
-    else:
-        sys.exit('\nFAIL: could not combine my_features_df and aa_df'
-                 , '\nCheck dims and merging cols!')
-        
-    #--------
-    # Reassign so downstream code doesn't need to change
-    #--------
-    my_df = merged_df.copy()
-    
-    #%% Data: my_df
-    # Check if non structural pos have crept in
-    # IDEALLY remove from source! But for rpoB do it here
-    # Drop NA where numerical cols have them
-    if gene.lower() in geneL_na_ppi2:
-        #D1148 get rid of
-        na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
-        my_df = my_df.drop(index=na_index)
-    
-    # FIXED: complete data for all muts inc L114M, F115L, V123L, V125I, V131M
-    # if gene.lower() in ['embb']:
-    #     na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
-    #     my_df = my_df.drop(index=na_index)
-    
-    # # Sanity check for non-structural positions
-    # print('\nChecking for non-structural postions')
-    # na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
-    # if len(na_index) > 0:
-    #     print('\nNon-structural positions detected for gene:', gene.lower()
-    #           , '\nTotal number of these detected:', len(na_index)
-    #           , '\These are at index:', na_index
-    #           , '\nOriginal nrows:', len(my_df)
-    #           , '\nDropping these...')
-    #     my_df = my_df.drop(index=na_index)
-    #     print('\nRevised nrows:', len(my_df))
-    # else:
-    #     print('\nNo non-structural positions detected for gene:', gene.lower()
-    #           , '\nnrows:', len(my_df))
-              
-    
-    ###########################################################################
-    #%% Add lineage calculation columns
-    #FIXME: Check if this can be imported from config?
-    total_mtblineage_uc = 8
-    lineage_colnames = ['lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode']
-    #bar = my_df[lineage_colnames]
-    my_df['lineage_proportion']      = my_df['lineage_count_unique']/my_df['lineage_count_all']
-    my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineage_uc
-    ###########################################################################
-    #%% Active site annotation column
-    # change from numeric to categorical
-    
-    if my_df['active_site'].dtype in num_type:
-        my_df['active_site'] = my_df['active_site'].astype(object)
-        my_df['active_site'].dtype
-    #%% AA property change
-    #--------------------
-    # Water prop change
-    #--------------------
-    my_df['water_change'] = my_df['wt_prop_water'] + str('_to_') + my_df['mut_prop_water']
-    my_df['water_change'].value_counts()
-    
-    water_prop_changeD = {
-        'hydrophobic_to_neutral'          : 'change'
-        , 'hydrophobic_to_hydrophobic'    : 'no_change'
-        , 'neutral_to_neutral'            : 'no_change'
-        , 'neutral_to_hydrophobic'        : 'change'
-        , 'hydrophobic_to_hydrophilic'    : 'change'
-        , 'neutral_to_hydrophilic'        : 'change'
-        , 'hydrophilic_to_neutral'        : 'change'
-        , 'hydrophilic_to_hydrophobic'    : 'change'
-        , 'hydrophilic_to_hydrophilic'    : 'no_change'
-    }
-    
-    my_df['water_change'] = my_df['water_change'].map(water_prop_changeD)
-    my_df['water_change'].value_counts()
-    
-    #--------------------
-    # Polarity change
-    #--------------------
-    my_df['polarity_change'] = my_df['wt_prop_polarity'] + str('_to_') + my_df['mut_prop_polarity']
-    my_df['polarity_change'].value_counts()
-    
-    polarity_prop_changeD = {
-        'non-polar_to_non-polar'     : 'no_change'
-        , 'non-polar_to_neutral'     : 'change'  
-        , 'neutral_to_non-polar'     : 'change'  
-        , 'neutral_to_neutral'       : 'no_change'  
-        , 'non-polar_to_basic'       : 'change'  
-        , 'acidic_to_neutral'        : 'change'  
-        , 'basic_to_neutral'         : 'change'  
-        , 'non-polar_to_acidic'      : 'change'  
-        , 'neutral_to_basic'         : 'change'  
-        , 'acidic_to_non-polar'      : 'change'  
-        , 'basic_to_non-polar'       : 'change'
-        , 'neutral_to_acidic'        : 'change'
-        , 'acidic_to_acidic'         : 'no_change'
-        , 'basic_to_acidic'          : 'change'
-        , 'basic_to_basic'           : 'no_change'
-        , 'acidic_to_basic'          : 'change'}
-    
-    my_df['polarity_change'] = my_df['polarity_change'].map(polarity_prop_changeD)
-    my_df['polarity_change'].value_counts()
-    
-    #--------------------
-    # Electrostatics change
-    #--------------------
-    my_df['electrostatics_change'] = my_df['wt_calcprop'] + str('_to_') + my_df['mut_calcprop']
-    my_df['electrostatics_change'].value_counts()
-    
-    calc_prop_changeD = {
-            'non-polar_to_non-polar'     : 'no_change'
-            , 'non-polar_to_polar'       : 'change'
-            , 'polar_to_non-polar'       : 'change'
-            , 'non-polar_to_pos'         : 'change'
-            , 'neg_to_non-polar'         : 'change'
-            , 'non-polar_to_neg'         : 'change'
-            , 'pos_to_polar'             : 'change'
-            , 'pos_to_non-polar'         : 'change'
-            , 'polar_to_polar'           : 'no_change'
-            , 'neg_to_neg'               : 'no_change'
-            , 'polar_to_neg'             : 'change'
-            , 'pos_to_neg'               : 'change'
-            , 'pos_to_pos'               : 'no_change'
-            , 'polar_to_pos'             : 'change'
-            , 'neg_to_polar'             : 'change'
-            , 'neg_to_pos'               : 'change'
-    }
-    
-    my_df['electrostatics_change'] = my_df['electrostatics_change'].map(calc_prop_changeD)
-    my_df['electrostatics_change'].value_counts()
-    
-    #--------------------    
-    # Summary change: Create a combined column summarising these three cols
-    #--------------------
-    detect_change = 'change'
-    check_prop_cols = ['water_change', 'polarity_change', 'electrostatics_change']
-    #my_df['aa_prop_change'] = (my_df.values == detect_change).any(1).astype(int)
-    my_df['aa_prop_change'] = (my_df[check_prop_cols].values == detect_change).any(1).astype(int)
-    my_df['aa_prop_change'].value_counts()
-    my_df['aa_prop_change'].dtype
-    
-    my_df['aa_prop_change'] = my_df['aa_prop_change'].map({1:'change'
-                                                           , 0: 'no_change'})
-    
-    my_df['aa_prop_change'].value_counts()
-    my_df['aa_prop_change'].dtype
-    
-    #%% IMPUTE values for OR [check script for exploration: UQ_or_imputer]
-    #--------------------
-    # Impute OR values
-    #--------------------
-    #or_cols = ['or_mychisq', 'log10_or_mychisq', 'or_fisher']
-    sel_cols = ['mutationinformation', 'or_mychisq', 'log10_or_mychisq']
-    or_cols = ['or_mychisq', 'log10_or_mychisq']
-    
-    print("count of NULL values before imputation\n")
-    print(my_df[or_cols].isnull().sum())
-    
-    my_dfI = pd.DataFrame(index = my_df['mutationinformation'] )
-    
-        
-    my_dfI = pd.DataFrame(KNN(n_neighbors=3, weights="uniform").fit_transform(my_df[or_cols])
-                          , index =  my_df['mutationinformation']
-                          , columns = or_cols )
-    my_dfI.columns = ['or_rawI', 'logorI']
-    my_dfI.columns
-    my_dfI = my_dfI.reset_index(drop = False) # drop = False keeps the old index (mutationinformation) as a column; it is the merge key used below
-    my_dfI.head()
-    print("count of NULL values AFTER imputation\n")
-    print(my_dfI.isnull().sum())
-    
-    #-------------------------------------------
-    # OR df Merge: with original on the 'mutationinformation' column
-    #-------------------------------------------
-    #my_df['index_bm'] = my_df.index
-    mydf_imputed = pd.merge(my_df
-                        , my_dfI
-                        , on = 'mutationinformation')
-    #mydf_imputed = mydf_imputed.set_index(['index_bm'])
-    
-    my_df['log10_or_mychisq'].isna().sum()
-    mydf_imputed['log10_or_mychisq'].isna().sum()
-    mydf_imputed['logorI'].isna().sum() # should be 0
-    
-    len(my_df.columns)
-    len(mydf_imputed.columns)  
-    
-    #-----------------------------------------
-    # REASSIGN my_df after imputing OR values
-    #-----------------------------------------
-    my_df = mydf_imputed.copy()
-    
-    if my_df['logorI'].isna().sum() == 0:
-        print('\nPASS: OR values imputed, data ready for ML')
-    else:
-        sys.exit('\nFAIL: something went wrong, Data not ready for ML. Please check upstream!')
-    
-    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    #---------------------------------------
-    # TODO: try other imputation like MICE
-    #---------------------------------------
-    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    
-    #%%########################################################################
-    #==========================
-    #     Data for ML
-    #==========================
-    my_df_ml = my_df.copy()
-    
-    # Build column names to mask for affinity changes
-    if gene.lower() in geneL_basic:
-        #X_stabilityN = common_cols_stabiltyN
-        gene_affinity_colnames = []# not needed as its the common ones 
-        cols_to_mask = ['ligand_affinity_change']
-        
-    if gene.lower() in geneL_ppi2:
-        gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] 
-        #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
-    
-    if gene.lower() in geneL_na:
-        gene_affinity_colnames =  ['mcsm_na_affinity'] 
-        #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
-    
-    if gene.lower() in geneL_na_ppi2:
-        gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-        #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
-    
-    #=======================
-    # Masking columns:
-    # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
-    #=======================
-    my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
-    my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
-    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
-    
-    # mask the mcsm affinity related columns where ligand distance > 10
-    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
-    (my_df_ml['ligand_affinity_change'] == 0).sum()
-    
-    mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]  
-    
-    #===================================================
-    # write file for check
-    mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
-    mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
-    #===================================================
-    ###############################################################################
-    #%% Feature groups (FG): Build X for Input ML 
-    ############################################################################
-    #===========================
-    # FG1: Evolutionary features
-    #===========================
-    X_evolFN =  ['consurf_score'
-               , 'snap2_score'
-               , 'provean_score']
-    
-    ###############################################################################
-    #========================
-    # FG2: Stability features
-    #========================
-    #--------
-    # common
-    #--------
-    X_common_stability_Fnum = [
-               'duet_stability_change'
-               , 'ddg_foldx'
-               , 'deepddg'
-               , 'ddg_dynamut2'
-               , 'contacts']
-    #--------
-    # FoldX
-    #--------
-    X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
-    , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
-    , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
-    , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
-    , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
-    , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss']
-    
-    X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum
-    
-    ###############################################################################
-    #===================
-    # FG3: Affinity features
-    #===================
-    common_affinity_Fnum =  ['ligand_distance'
-                    , 'ligand_affinity_change'
-                    , 'mmcsm_lig']
-    
-    # if gene.lower() in geneL_basic:
-    #     X_affinityFN = common_affinity_Fnum 
-    # else:
-    #     X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
-        
-    X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
-    
-    ###############################################################################
-    #============================
-    # FG4: Residue level features
-    #============================
-    #-----------
-    # AA index
-    #-----------
-    X_aaindex_Fnum = list(aa_df_cols)
-    print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum))
-    
-    #-----------------
-    # surface area
-    # depth
-    # hydrophobicity
-    #-----------------
-    X_str_Fnum =  ['rsa'
-               #, 'asa'
-               , 'kd_values'
-               , 'rd_values']   
-    
-    #---------------------------
-    # Other aa properties
-    # active site indication
-    #---------------------------
-    X_aap_Fcat = ['ss_class'
-                # , 'wt_prop_water'
-                # , 'mut_prop_water'
-                # , 'wt_prop_polarity'
-                # , 'mut_prop_polarity'
-                # , 'wt_calcprop'
-                # , 'mut_calcprop'
-                , 'aa_prop_change'
-                , 'electrostatics_change'
-                , 'polarity_change'
-                , 'water_change'
-                , 'active_site']
-       
-    X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
-    ###############################################################################
-    #========================
-    # FG5: Genomic features
-    #========================
-    X_gn_mafor_Fnum =  ['maf'
-                    #, 'logorI'
-                    # , 'or_rawI'
-                    # , 'or_mychisq'
-                    # , 'or_logistic'
-                    # , 'or_fisher'
-                    # , 'pval_fisher'
-                    ]
-    
-    X_gn_linegae_Fnum  = ['lineage_proportion'
-                          , 'dist_lineage_proportion'
-                          #, 'lineage' # could be included as a category but it has L2;L4  formatting
-                          , 'lineage_count_all'
-                          , 'lineage_count_unique'
-                          ]
-    
-    # X_gn_Fcat = ['drtype_mode_labels'  # beware then you can't use it to predict [USED it for uq_v1, not v2]
-    #                #, 'gene_name' # will be required for the combined stuff
-    #              ]
-    X_gn_Fcat = []
-    
-    X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat
-    ###############################################################################
-    #========================
-    # FG6 collapsed: Structural : Stability + Affinity + ResidueProp
-    #========================
-    X_structural_FN =  X_stability_FN + X_affinityFN + X_resprop_FN
-    
-    ###############################################################################
-    #========================
-    # BUILDING all features
-    #========================
-    all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
-    
-    ###############################################################################
-    #%% Define training and test data
-    #================================================================
-    # Training and BLIND test set: 70/30
-    # dst with actual values  : training set
-    # dst with imputed values : THROW AWAY [unrepresentative]
-    #================================================================
-    my_df_ml[drug].isna().sum()
-    
-    #    blind_test_df = my_df_ml[my_df_ml[drug].isna()]
-    #    blind_test_df.shape
-    
-    training_df = my_df_ml[my_df_ml[drug].notna()]
-    training_df.shape
-    
-    # Target 1: dst_mode
-    training_df[drug].value_counts()
-    training_df['dst_mode'].value_counts()
-    
-    ####################################################################
-    #====================================
-    # ML data: Train test split: 70/30
-    # with stratification
-    # 70% : training_data for CV
-    # 30% : blind test (NOTE: train_test_split below is called with test_size = 0.33, i.e. roughly 67/33)
-    #=====================================
-    x_features = training_df[all_featuresN]
-    y_target   = training_df['dst_mode']
-    
-    # sanity check
-    if not 'dst_mode' in x_features.columns:
-        print('\nPASS: x_features has no target variable')
-        x_ncols = len(x_features.columns)
-        print('\nNo. of columns for x_features:', x_ncols)
-        # NEED It for scaling law split
-        #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d
-    else:
-        sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')
-    #-------------------
-    # train-test split
-    #-------------------
-    #x_train, x_test, y_train, y_test # traditional var_names
-    # so my downstream code doesn't need to change    
-    X, X_bts, y, y_bts = train_test_split(x_features, y_target
-                                                    , test_size = 0.33
-                                                    , **rs
-                                                    , stratify = y_target)
-    yc1 = Counter(y)
-    yc1_ratio = yc1[0]/yc1[1]
-    
-    yc2 = Counter(y_bts)
-    yc2_ratio = yc2[0]/yc2[1]
-    
-    ###############################################################################
-    #======================================================
-    # Determine categorical and numerical features
-    #======================================================
-    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
-    numerical_cols 
-    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
-    categorical_cols 
-    
-    ################################################################################
-    # IMPORTANT sanity checks
-    if len(X.columns) == len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN):
-        print('\nPASS: ML data with input features, training and test generated...'
-              , '\n\nTotal no. of input features:'        , len(X.columns)
-              , '\n--------No. of numerical features:'    , len(numerical_cols)
-              , '\n--------No. of categorical features:'  , len(categorical_cols)
-              
-              , '\n\nTotal no. of evolutionary features:' , len(X_evolFN)
-              
-              , '\n\nTotal no. of stability features:'    , len(X_stability_FN)
-              , '\n--------Common stabilty cols:'         , len(X_common_stability_Fnum)
-              , '\n--------Foldx cols:'                   , len(X_foldX_Fnum)
-              
-              , '\n\nTotal no. of affinity features:'     , len(X_affinityFN)
-              , '\n--------Common affinity cols:'         , len(common_affinity_Fnum)
-              , '\n--------Gene specific affinity cols:'  , len(gene_affinity_colnames)
-              
-              , '\n\nTotal no. of residue level features:', len(X_resprop_FN)
-              , '\n--------AA index cols:'                , len(X_aaindex_Fnum)
-              , '\n--------Residue Prop cols:'            , len(X_str_Fnum)
-              , '\n--------AA change Prop cols:'          , len(X_aap_Fcat)
-              
-              , '\n\nTotal no. of genomic features:'      , len(X_genomicFN)
-              , '\n--------MAF+OR cols:'                  , len(X_gn_mafor_Fnum)
-              , '\n--------Lineage cols:'                 , len(X_gn_linegae_Fnum)
-              , '\n--------Other cols:'                   , len(X_gn_Fcat)
-              )
-    else:
-        print('\nFAIL: numbers mismatch'
-              , '\nExpected:',len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN)
-              , '\nGot:', len(X.columns))
-        sys.exit()
-    ###############################################################################
-    print('\n-------------------------------------------------------------'
-          , '\nSuccessfully split data: ALL features'
-          , '\nactual values: training set'
-          ,  '\nSplit:', tts_split
-          #, '\nimputed values: blind test set'
-          
-          , '\n\nTotal data size:', len(X) + len(X_bts)
-    
-          , '\n\nTrain data size:', X.shape
-          , '\ny_train numbers:', yc1
-    
-          , '\n\nTest data size:', X_bts.shape
-          , '\ny_test_numbers:', yc2
-    
-          , '\n\ny_train ratio:',yc1_ratio
-          , '\ny_test ratio:', yc2_ratio
-          , '\n-------------------------------------------------------------'
-          )
-    ##########################################################################    
-    # Quick check
-    #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
-    for i in range(len(cols_to_mask)):
-        ind = i+1
-        print('\nindex:', i, '\nind:', ind)
-        print('\nMask count check:'
-              , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
-              )
-    
-    print('Original Data\n', Counter(y)
-          , 'Data dim:', X.shape)
-    ###########################################################################
-    #%% 
-    ###########################################################################
-    #                               RESAMPLING
-    ###########################################################################
-    #------------------------------
-    # Simple Random oversampling
-    # [Numerical + categorical]
-    #------------------------------
-    oversample = RandomOverSampler(sampling_strategy='minority')
-    X_ros, y_ros = oversample.fit_resample(X, y)
-    print('\nSimple Random OverSampling\n', Counter(y_ros))
-    print(X_ros.shape)
-    
-    #------------------------------
-    # Simple Random Undersampling
-    # [Numerical + categorical]
-    #------------------------------
-    undersample = RandomUnderSampler(sampling_strategy='majority')
-    X_rus, y_rus = undersample.fit_resample(X, y)
-    print('\nSimple Random UnderSampling\n', Counter(y_rus))
-    print(X_rus.shape)
-    
-    #------------------------------
-    # Simple combine ROS and RUS
-    # [Numerical + categorical]
-    #------------------------------
-    oversample = RandomOverSampler(sampling_strategy='minority')
-    X_ros, y_ros = oversample.fit_resample(X, y)
-    undersample = RandomUnderSampler(sampling_strategy='majority')
-    X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
-    print('\nSimple Combined Over and UnderSampling\n',  Counter(y_rouC))
-    print(X_rouC.shape)
-    
-    #------------------------------
-    # SMOTE_NC: oversampling 
-    # [numerical + categorical]
-    #https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
-    #------------------------------
-    # Determine categorical and numerical features
-    numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
-    numerical_ix
-    num_featuresL = list(numerical_ix)
-    numerical_colind = X.columns.get_indexer(list(numerical_ix) )
-    numerical_colind
-    
-    categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
-    categorical_ix    
-    categorical_colind = X.columns.get_indexer(list(categorical_ix))
-    categorical_colind
-    
-    k_sm = 5 # 5 is default
-    sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
-    X_smnc, y_smnc = sm_nc.fit_resample(X, y)
-    print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
-    print(X_smnc.shape)
-    globals().update(locals()) # TROLOLOLOLOLOLS
-    #print("i did a horrible hack :-)")
-    ###############################################################################
-    #%% SMOTE RESAMPLING for NUMERICAL ONLY*
-    # #------------------------------
-    # # SMOTE: Oversampling
-    # # [Numerical ONLY]
-    # #------------------------------
-    # k_sm = 1
-    # sm = SMOTE(sampling_strategy = 'auto', k_neighbors = k_sm, **rs)
-    # X_sm, y_sm = sm.fit_resample(X, y)
-    # print(X_sm.shape)
-    # print('\nSMOTE OverSampling\n', Counter(y_sm))
-    # y_sm_df = y_sm.to_frame()
-    # y_sm_df.value_counts().plot(kind = 'bar')
-    
-    # #------------------------------
-    # # SMOTE: Over + Undersampling COMBINED
-    # # [Numerical ONLY]
-    # #-----------------------------
-    # sm_enn = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all', **rs, **njobs ))
-    # X_enn, y_enn = sm_enn.fit_resample(X, y)
-    # print(X_enn.shape)
-    # print('\nSMOTE Over+Under Sampling combined\n', Counter(y_enn))
-    
-    ###########################################################################
-    # TODO: Find over and undersampling JUST for categorical data
-    ###########################################################################
-    
-    print('\n#################################################################'
-          , '\nDim of X for gene:', gene.lower(), '\n',  X.shape
-          , '\n###############################################################')
--- a/scripts/ml/ml_data_8020.py
+++ b/scripts/ml/ml_data_8020.py
@ -1,806 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Sun Mar  6 13:41:54 2022
-
-@author: tanu
-"""
-def setvars(gene,drug):
-    #https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
-    import os, sys
-    import pandas as pd
-    import numpy as np
-    print(np.__version__)
-    print(pd.__version__)
-    import pprint as pp
-    from copy import deepcopy
-    from collections import Counter
-    from sklearn.impute import KNNImputer as KNN
-    from imblearn.over_sampling import RandomOverSampler
-    from imblearn.under_sampling import RandomUnderSampler
-    from imblearn.over_sampling import SMOTE
-    from sklearn.datasets import make_classification
-    from imblearn.combine import SMOTEENN
-    from imblearn.combine import SMOTETomek
-    
-    from imblearn.over_sampling import SMOTENC
-    from imblearn.under_sampling import EditedNearestNeighbours
-    from imblearn.under_sampling import RepeatedEditedNearestNeighbours
-    
-    from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
-    from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
-    
-    from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
-    from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
-    
-    from sklearn.pipeline import Pipeline, make_pipeline
-    import argparse
-    import re
-    #%% GLOBALS
-    tts_split = "80_20"
-
-    rs = {'random_state': 42}
-    njobs = {'n_jobs': 10}
-    
-    scoring_fn =  ({ 'mcc'         : make_scorer(matthews_corrcoef)
-                    , 'accuracy'   : make_scorer(accuracy_score)
-                    , 'fscore'     : make_scorer(f1_score)
-                    , 'precision'  : make_scorer(precision_score)
-                    , 'recall'     : make_scorer(recall_score)
-                    , 'roc_auc'    : make_scorer(roc_auc_score)
-                    , 'jcc'        : make_scorer(jaccard_score)
-                }) 
-      
-    skf_cv = StratifiedKFold(n_splits = 10
-                              #, shuffle = False, random_state= None)
-                               , shuffle = True,**rs)
-    
-    rskf_cv = RepeatedStratifiedKFold(n_splits = 10
-                                      , n_repeats = 3
-                                      , **rs)
-    
-    mcc_score_fn  = {'mcc': make_scorer(matthews_corrcoef)}
-    jacc_score_fn = {'jcc': make_scorer(jaccard_score)}   
-    #%% FOR LATER: Combine ED logo data
-    ###########################################################################
-
-    homedir = os.path.expanduser("~")
-    
-    geneL_basic     = ['pnca']
-    geneL_na        = ['gid']
-    geneL_na_ppi2   = ['rpob']
-    geneL_ppi2      = ['alr', 'embb', 'katg']
-    
-    #num_type = ['int64', 'float64']
-    num_type = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
-    cat_type = ['object', 'bool']
-    
-    #==============
-    # directories
-    #==============
-    datadir = homedir + '/git/Data/'
-    indir   = datadir + drug + '/input/'
-    outdir  = datadir + drug + '/output/'
-    
-    #=======
-    # input
-    #=======
-    
-    #---------
-    # File 1
-    #---------
-    infile_ml1 = outdir + gene.lower() + '_merged_df3.csv' 
-    #infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
-    
-    my_features_df = pd.read_csv(infile_ml1, index_col = 0) 
-    my_features_df  = my_features_df .reset_index(drop = True)
-    my_features_df.index
-    
-    my_features_df.dtypes
-    mycols = my_features_df.columns
-    
-    #---------
-    # File 2
-    #---------
-    infile_aaindex = outdir + 'aa_index/' + gene.lower() + '_aa.csv' 
-    aaindex_df = pd.read_csv(infile_aaindex, index_col = 0) 
-    aaindex_df.dtypes
-    
-    #-----------
-    # check for non-numerical columns
-    #-----------
-    if any(aaindex_df.dtypes==object):
-        print('\naaindex_df contains non-numerical data')
-    
-    aaindex_df_object = aaindex_df.select_dtypes(include = cat_type)
-    print('\nTotal no. of non-numerial columns:', len(aaindex_df_object.columns))
-    
-    expected_aa_ncols = len(aaindex_df.columns) - len(aaindex_df_object.columns)
-    
-    #-----------
-    # Extract numerical data only
-    #-----------
-    print('\nSelecting numerical data only')
-    aaindex_df = aaindex_df.select_dtypes(include = num_type)
-    
-    #---------------------------
-    # aaindex: sanity check 1
-    #---------------------------
-    if len(aaindex_df.columns) == expected_aa_ncols:
-        print('\nPASS: successfully selected numerical columns only for aaindex_df')
-    else:
-        print('\nFAIL: Numbers mismatch'
-              , '\nExpected ncols:', expected_aa_ncols
-              , '\nGot:', len(aaindex_df.columns))    
-        
-    #---------------
-    # check for NA
-    #---------------
-    print('\nNow checking for NA in the remaining aaindex_cols')
-    c1 = aaindex_df.isna().sum()
-    c2 = c1.sort_values(ascending=False)
-    print('\nCounting aaindex_df cols with NA'
-          , '\nncols with NA:', sum(c2>0), 'columns'
-          , '\nDropping these...'
-          , '\nOriginal ncols:', len(aaindex_df.columns)
-          )
-    aa_df = aaindex_df.dropna(axis=1)
-    
-    print('\nRevised df ncols:', len(aa_df.columns))
-    
-    c3 = aa_df.isna().sum()
-    c4 = c3.sort_values(ascending=False)
-    
-    print('\nChecking NA in revised df...')
-    
-    if sum(c4>0):
-        sys.exit('\nFAIL: aaindex_df still contains cols with NA, please check and drop these before proceeding...')
-    else:
-        print('\nPASS: cols with NA successfully dropped from aaindex_df'
-              , '\nProceeding with combining aa_df with other features_df')
-        
-    #---------------------------
-    # aaindex: sanity check 2
-    #---------------------------
-    expected_aa_ncols2 =  len(aaindex_df.columns) - sum(c2>0)  
-    if len(aa_df.columns) == expected_aa_ncols2:
-        print('\nPASS: ncols match'
-              , '\nExpected ncols:', expected_aa_ncols2
-              , '\nGot:', len(aa_df.columns))
-    else:
-        print('\nFAIL: Numbers mismatch'
-              , '\nExpected ncols:', expected_aa_ncols2
-              , '\nGot:', len(aa_df.columns))            
-        
-    # Important: need this to identify aaindex cols    
-    aa_df_cols = aa_df.columns
-    print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
-    
-    ###############################################################################
-    #%% Combining my_features_df and aaindex_df
-    #===========================
-    # Merge my_df + aaindex_df
-    #===========================
-    
-    if aa_df.columns[aa_df.columns.isin(my_features_df.columns)] == my_features_df.columns[my_features_df.columns.isin(aa_df.columns)]:
-        print('\nMerging on column: mutationinformation')   
-    
-    if len(my_features_df) == len(aa_df):
-        expected_nrows = len(my_features_df)
-        print('\nProceeding to merge, expected nrows in merged_df:', expected_nrows)
-    else:
-        sys.exit('\nNrows mismatch, cannot merge. Please check'
-              , '\nnrows my_df:', len(my_features_df)
-              , '\nnrows aa_df:', len(aa_df))
-               
-    #-----------------
-    # Reset index: mutationinformation
-    # Very important for merging
-    #-----------------
-    aa_df = aa_df.reset_index()
-    
-    expected_ncols = len(my_features_df.columns) + len(aa_df.columns) - 1 # for the no. of merging col
-    
-    #-----------------
-    # Merge: my_features_df + aa_df
-    #-----------------
-    merged_df = pd.merge(my_features_df
-                         , aa_df
-                         , on = 'mutationinformation')
-    
-    #---------------------------
-    # aaindex: sanity check 3
-    #---------------------------
-    if len(merged_df.columns) == expected_ncols:
-        print('\nPASS: my_features_df and aa_df successfully combined'
-              , '\nnrows:', len(merged_df)
-              , '\nncols:', len(merged_df.columns))
-    else:
-        sys.exit('\nFAIL: could not combine my_features_df and aa_df'
-                 , '\nCheck dims and merging cols!')
-        
-    #--------
-    # Reassign so downstream code doesn't need to change
-    #--------
-    my_df = merged_df.copy()
-    
-    #%% Data: my_df
-    # Check if non structural pos have crept in
-    # IDEALLY remove from source! But for rpoB do it here
-    # Drop NA where numerical cols have them
-    if gene.lower() in geneL_na_ppi2:
-        #D1148 get rid of
-        na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
-        my_df = my_df.drop(index=na_index)
-    
-    # FIXED: complete data for all muts inc L114M, F115L, V123L, V125I, V131M
-    # if gene.lower() in ['embb']:
-    #     na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
-    #     my_df = my_df.drop(index=na_index)
-    
-    # # Sanity check for non-structural positions
-    # print('\nChecking for non-structural postions')
-    # na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
-    # if len(na_index) > 0:
-    #     print('\nNon-structural positions detected for gene:', gene.lower()
-    #           , '\nTotal number of these detected:', len(na_index)
-    #           , '\These are at index:', na_index
-    #           , '\nOriginal nrows:', len(my_df)
-    #           , '\nDropping these...')
-    #     my_df = my_df.drop(index=na_index)
-    #     print('\nRevised nrows:', len(my_df))
-    # else:
-    #     print('\nNo non-structural positions detected for gene:', gene.lower()
-    #           , '\nnrows:', len(my_df))
-              
-    
-    ###########################################################################
-    #%% Add lineage calculation columns
-    #FIXME: Check if this can be imported from config?
-    total_mtblineage_uc = 8
-    lineage_colnames = ['lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode']
-    #bar = my_df[lineage_colnames]
-    my_df['lineage_proportion']      = my_df['lineage_count_unique']/my_df['lineage_count_all']
-    my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineage_uc
-    ###########################################################################
-    #%% Active site annotation column
-    # change from numberic to categorical
-    
-    if my_df['active_site'].dtype in num_type:
-        my_df['active_site'] = my_df['active_site'].astype(object)
-        my_df['active_site'].dtype
-    #%% AA property change
-    #--------------------
-    # Water prop change
-    #--------------------
-    my_df['water_change'] = my_df['wt_prop_water'] + str('_to_') + my_df['mut_prop_water']
-    my_df['water_change'].value_counts()
-    
-    water_prop_changeD = {
-        'hydrophobic_to_neutral'          : 'change'
-        , 'hydrophobic_to_hydrophobic'    : 'no_change'
-        , 'neutral_to_neutral'            : 'no_change'
-        , 'neutral_to_hydrophobic'        : 'change'
-        , 'hydrophobic_to_hydrophilic'    : 'change'
-        , 'neutral_to_hydrophilic'        : 'change'
-        , 'hydrophilic_to_neutral'        : 'change'
-        , 'hydrophilic_to_hydrophobic'    : 'change'
-        , 'hydrophilic_to_hydrophilic'    : 'no_change'
-    }
-    
-    my_df['water_change'] = my_df['water_change'].map(water_prop_changeD)
-    my_df['water_change'].value_counts()
-    
-    #--------------------
-    # Polarity change
-    #--------------------
-    my_df['polarity_change'] = my_df['wt_prop_polarity'] + str('_to_') + my_df['mut_prop_polarity']
-    my_df['polarity_change'].value_counts()
-    
-    polarity_prop_changeD = {
-        'non-polar_to_non-polar'     : 'no_change'
-        , 'non-polar_to_neutral'     : 'change'  
-        , 'neutral_to_non-polar'     : 'change'  
-        , 'neutral_to_neutral'       : 'no_change'  
-        , 'non-polar_to_basic'       : 'change'  
-        , 'acidic_to_neutral'        : 'change'  
-        , 'basic_to_neutral'         : 'change'  
-        , 'non-polar_to_acidic'      : 'change'  
-        , 'neutral_to_basic'         : 'change'  
-        , 'acidic_to_non-polar'      : 'change'  
-        , 'basic_to_non-polar'       : 'change'
-        , 'neutral_to_acidic'        : 'change'
-        , 'acidic_to_acidic'         : 'no_change'
-        , 'basic_to_acidic'          : 'change'
-        , 'basic_to_basic'           : 'no_change'
-        , 'acidic_to_basic'          : 'change'}
-    
-    my_df['polarity_change'] = my_df['polarity_change'].map(polarity_prop_changeD)
-    my_df['polarity_change'].value_counts()
-    
-    #--------------------
-    # Electrostatics change
-    #--------------------
-    my_df['electrostatics_change'] = my_df['wt_calcprop'] + str('_to_') + my_df['mut_calcprop']
-    my_df['electrostatics_change'].value_counts()
-    
-    calc_prop_changeD = {
-            'non-polar_to_non-polar'     : 'no_change'
-            , 'non-polar_to_polar'       : 'change'
-            , 'polar_to_non-polar'       : 'change'
-            , 'non-polar_to_pos'         : 'change'
-            , 'neg_to_non-polar'         : 'change'
-            , 'non-polar_to_neg'         : 'change'
-            , 'pos_to_polar'             : 'change'
-            , 'pos_to_non-polar'         : 'change'
-            , 'polar_to_polar'           : 'no_change'
-            , 'neg_to_neg'               : 'no_change'
-            , 'polar_to_neg'             : 'change'
-            , 'pos_to_neg'               : 'change'
-            , 'pos_to_pos'               : 'no_change'
-            , 'polar_to_pos'             : 'change'
-            , 'neg_to_polar'             : 'change'
-            , 'neg_to_pos'               : 'change'
-    }
-    
-    my_df['electrostatics_change'] = my_df['electrostatics_change'].map(calc_prop_changeD)
-    my_df['electrostatics_change'].value_counts()
-    
-    #--------------------    
-    # Summary change: Create a combined column summarising these three cols
-    #--------------------
-    detect_change = 'change'
-    check_prop_cols = ['water_change', 'polarity_change', 'electrostatics_change']
-    #my_df['aa_prop_change'] = (my_df.values == detect_change).any(1).astype(int)
-    my_df['aa_prop_change'] = (my_df[check_prop_cols].values == detect_change).any(1).astype(int)
-    my_df['aa_prop_change'].value_counts()
-    my_df['aa_prop_change'].dtype
-    
-    my_df['aa_prop_change'] = my_df['aa_prop_change'].map({1:'change'
-                                                           , 0: 'no_change'})
-    
-    my_df['aa_prop_change'].value_counts()
-    my_df['aa_prop_change'].dtype
-    
-    #%% IMPUTE values for OR [check script for exploration: UQ_or_imputer]
-    #--------------------
-    # Impute OR values
-    #--------------------
-    #or_cols = ['or_mychisq', 'log10_or_mychisq', 'or_fisher']
-    sel_cols = ['mutationinformation', 'or_mychisq', 'log10_or_mychisq']
-    or_cols = ['or_mychisq', 'log10_or_mychisq']
-    
-    print("count of NULL values before imputation\n")
-    print(my_df[or_cols].isnull().sum())
-    
-    my_dfI = pd.DataFrame(index = my_df['mutationinformation'] )
-    
-        
-    my_dfI = pd.DataFrame(KNN(n_neighbors=3, weights="uniform").fit_transform(my_df[or_cols])
-                          , index =  my_df['mutationinformation']
-                          , columns = or_cols )
-    my_dfI.columns = ['or_rawI', 'logorI']
-    my_dfI.columns
-    my_dfI = my_dfI.reset_index(drop = False) # prevents old index from being added as a column
-    my_dfI.head()
-    print("count of NULL values AFTER imputation\n")
-    print(my_dfI.isnull().sum())
-    
-    #-------------------------------------------
-    # OR df Merge: with original based on index
-    #-------------------------------------------
-    #my_df['index_bm'] = my_df.index
-    mydf_imputed = pd.merge(my_df
-                        , my_dfI
-                        , on = 'mutationinformation')
-    #mydf_imputed = mydf_imputed.set_index(['index_bm'])
-    
-    my_df['log10_or_mychisq'].isna().sum()
-    mydf_imputed['log10_or_mychisq'].isna().sum()
-    mydf_imputed['logorI'].isna().sum() # should be 0
-    
-    len(my_df.columns)
-    len(mydf_imputed.columns)  
-    
-    #-----------------------------------------
-    # REASSIGN my_df after imputing OR values
-    #-----------------------------------------
-    my_df = mydf_imputed.copy()
-    
-    if my_df['logorI'].isna().sum() == 0:
-        print('\nPASS: OR values imputed, data ready for ML')
-    else:
-        sys.exit('\nFAIL: something went wrong, Data not ready for ML. Please check upstream!')
-    
-    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    #---------------------------------------
-    # TODO: try other imputation like MICE
-    #---------------------------------------
-    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    
-    #%%########################################################################
-    #==========================
-    #     Data for ML
-    #==========================
-    my_df_ml = my_df.copy()
-    
-    # Build column names to mask for affinity chanhes
-    if gene.lower() in geneL_basic:
-        #X_stabilityN = common_cols_stabiltyN
-        gene_affinity_colnames = []# not needed as its the common ones 
-        cols_to_mask = ['ligand_affinity_change']
-        
-    if gene.lower() in geneL_ppi2:
-        gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] 
-        #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
-    
-    if gene.lower() in geneL_na:
-        gene_affinity_colnames =  ['mcsm_na_affinity'] 
-        #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
-    
-    if gene.lower() in geneL_na_ppi2:
-        gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-        #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
-    
-    #=======================
-    # Masking columns:
-    # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
-    #=======================
-    my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
-    my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
-    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
-    
-    # mask the mcsm affinity related columns where ligand distance > 10
-    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
-    (my_df_ml['ligand_affinity_change'] == 0).sum()
-    
-    mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]  
-    
-    #===================================================
-    # write file for check
-    mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
-    mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
-    #===================================================
-    ###############################################################################
-    #%% Feature groups (FG): Build X for Input ML 
-    ############################################################################
-    #===========================
-    # FG1: Evolutionary features
-    #===========================
-    X_evolFN =  ['consurf_score'
-               , 'snap2_score'
-               , 'provean_score']
-    
-    ###############################################################################
-    #========================
-    # FG2: Stability features
-    #========================
-    #--------
-    # common
-    #--------
-    X_common_stability_Fnum = [
-               'duet_stability_change'
-               , 'ddg_foldx'
-               , 'deepddg'
-               , 'ddg_dynamut2'
-               , 'contacts']
-    #--------
-    # FoldX
-    #--------
-    X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
-    , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
-    , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
-    , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
-    , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
-    , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss']
-    
-    X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum
-    
-    ###############################################################################
-    #===================
-    # FG3: Affinity features
-    #===================
-    common_affinity_Fnum =  ['ligand_distance'
-                    , 'ligand_affinity_change'
-                    , 'mmcsm_lig']
-    
-    # if gene.lower() in geneL_basic:
-    #     X_affinityFN = common_affinity_Fnum 
-    # else:
-    #     X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
-        
-    X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
-    
-    ###############################################################################
-    #============================
-    # FG4: Residue level features
-    #============================
-    #-----------
-    # AA index
-    #-----------
-    X_aaindex_Fnum = list(aa_df_cols)
-    print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum))
-    
-    #-----------------
-    # surface area
-    # depth
-    # hydrophobicity
-    #-----------------
-    X_str_Fnum =  ['rsa'
-               #, 'asa'
-               , 'kd_values'
-               , 'rd_values']   
-    
-    #---------------------------
-    # Other aa properties
-    # active site indication
-    #---------------------------
-    X_aap_Fcat = ['ss_class'
-                # , 'wt_prop_water'
-                # , 'mut_prop_water'
-                # , 'wt_prop_polarity'
-                # , 'mut_prop_polarity'
-                # , 'wt_calcprop'
-                # , 'mut_calcprop'
-                , 'aa_prop_change'
-                , 'electrostatics_change'
-                , 'polarity_change'
-                , 'water_change'
-                , 'active_site']
-       
-    X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
-    ###############################################################################
-    #========================
-    # FG5: Genomic features
-    #========================
-    X_gn_mafor_Fnum =  ['maf'
-                    #, 'logorI'
-                    # , 'or_rawI'
-                    # , 'or_mychisq'
-                    # , 'or_logistic'
-                    # , 'or_fisher'
-                    # , 'pval_fisher'
-                    ]
-    
-    X_gn_linegae_Fnum  = ['lineage_proportion'
-                          , 'dist_lineage_proportion'
-                          #, 'lineage' # could be included as a category but it has L2;L4  formatting
-                          , 'lineage_count_all'
-                          , 'lineage_count_unique'
-                          ]
-    
-    # X_gn_Fcat = ['drtype_mode_labels'  # beware then you can't use it to predict [USED it for uq_v1, not v2]
-    #                #, 'gene_name' # will be required for the combined stuff
-    #              ]
-    X_gn_Fcat = []
-    
-    X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat
-    ###############################################################################
-    #========================
-    # FG6 collapsed: Structural : Atability + Affinity + ResidueProp
-    #========================
-    X_structural_FN =  X_stability_FN + X_affinityFN + X_resprop_FN
-    
-    ###############################################################################
-    #========================
-    # BUILDING all features
-    #========================
-    all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
-    
-    ###############################################################################
-    #%% Define training and test data
-    #================================================================
-    # Training and BLIND test set: 80/20
-    # dst with actual values  : training set
-    # dst with imputed values : THROW AWAY [unrepresentative]
-    #================================================================
-    my_df_ml[drug].isna().sum()
-    
-    #    blind_test_df = my_df_ml[my_df_ml[drug].isna()]
-    #    blind_test_df.shape
-    
-    training_df = my_df_ml[my_df_ml[drug].notna()]
-    training_df.shape
-    
-    # Target 1: dst_mode
-    training_df[drug].value_counts()
-    training_df['dst_mode'].value_counts()
-    
-    ####################################################################
-    #====================================
-    # ML data: Train test split: 80/20
-    # with stratification
-    # 80% : training_data for CV
-    # 20% : blind test 
-    #=====================================
-    x_features = training_df[all_featuresN]
-    y_target   = training_df['dst_mode']
-    
-    # sanity check
-    if not 'dst_mode' in x_features.columns:
-        print('\nPASS: x_features has no target variable')
-        x_ncols = len(x_features.columns)
-        print('\nNo. of columns for x_features:', x_ncols)
-        # NEED It for scaling law split
-        #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d
-    else:
-        sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')
-    #-------------------
-    # train-test split
-    #-------------------
-    #x_train, x_test, y_train, y_test # traditional var_names
-    # so my downstream code doesn't need to change    
-    X, X_bts, y, y_bts = train_test_split(x_features, y_target
-                                                    , test_size = 0.2
-                                                    , **rs
-                                                    , stratify = y_target)
-    yc1 = Counter(y)
-    yc1_ratio = yc1[0]/yc1[1]
-    
-    yc2 = Counter(y_bts)
-    yc2_ratio = yc2[0]/yc2[1]
-    
-    ###############################################################################
-    #======================================================
-    # Determine categorical and numerical features
-    #======================================================
-    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
-    numerical_cols 
-    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
-    categorical_cols 
-    
-    ################################################################################
-    # IMPORTANT sanity checks
-    if len(X.columns) == len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN):
-        print('\nPASS: ML data with input features, training and test generated...'
-              , '\n\nTotal no. of input features:'        , len(X.columns)
-              , '\n--------No. of numerical features:'    , len(numerical_cols)
-              , '\n--------No. of categorical features:'  , len(categorical_cols)
-              
-              , '\n\nTotal no. of evolutionary features:' , len(X_evolFN)
-              
-              , '\n\nTotal no. of stability features:'    , len(X_stability_FN)
-              , '\n--------Common stabilty cols:'         , len(X_common_stability_Fnum)
-              , '\n--------Foldx cols:'                   , len(X_foldX_Fnum)
-              
-              , '\n\nTotal no. of affinity features:'     , len(X_affinityFN)
-              , '\n--------Common affinity cols:'         , len(common_affinity_Fnum)
-              , '\n--------Gene specific affinity cols:'  , len(gene_affinity_colnames)
-              
-              , '\n\nTotal no. of residue level features:', len(X_resprop_FN)
-              , '\n--------AA index cols:'                , len(X_aaindex_Fnum)
-              , '\n--------Residue Prop cols:'            , len(X_str_Fnum)
-              , '\n--------AA change Prop cols:'          , len(X_aap_Fcat)
-              
-              , '\n\nTotal no. of genomic features:'      , len(X_genomicFN)
-              , '\n--------MAF+OR cols:'                  , len(X_gn_mafor_Fnum)
-              , '\n--------Lineage cols:'                 , len(X_gn_linegae_Fnum)
-              , '\n--------Other cols:'                   , len(X_gn_Fcat)
-              )
-    else:
-        print('\nFAIL: numbers mismatch'
-              , '\nExpected:',len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN)
-              , '\nGot:', len(X.columns))
-        sys.exit()
-    ###############################################################################
-    print('\n-------------------------------------------------------------'
-          , '\nSuccessfully split data: ALL features'
-          , '\nactual values: training set'
-          ,  '\nSplit:', tts_split
-          #, '\nimputed values: blind test set'
-          
-          , '\n\nTotal data size:', len(X) + len(X_bts)
-    
-          , '\n\nTrain data size:', X.shape
-          , '\ny_train numbers:', yc1
-    
-          , '\n\nTest data size:', X_bts.shape
-          , '\ny_test_numbers:', yc2
-    
-          , '\n\ny_train ratio:',yc1_ratio
-          , '\ny_test ratio:', yc2_ratio
-          , '\n-------------------------------------------------------------'
-          )
-    ##########################################################################    
-    # Quick check
-    #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
-    for i in range(len(cols_to_mask)):
-        ind = i+1
-        print('\nindex:', i, '\nind:', ind)
-        print('\nMask count check:'
-              , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
-              )
-    
-    print('Original Data\n', Counter(y)
-          , 'Data dim:', X.shape)
-    ###########################################################################
-    #%% 
-    ###########################################################################
-    #                               RESAMPLING
-    ###########################################################################
-    #------------------------------
-    # Simple Random oversampling
-    # [Numerical + catgeorical]
-    #------------------------------
-    oversample = RandomOverSampler(sampling_strategy='minority')
-    X_ros, y_ros = oversample.fit_resample(X, y)
-    print('\nSimple Random OverSampling\n', Counter(y_ros))
-    print(X_ros.shape)
-    
-    #------------------------------
-    # Simple Random Undersampling
-    # [Numerical + catgeorical]
-    #------------------------------
-    undersample = RandomUnderSampler(sampling_strategy='majority')
-    X_rus, y_rus = undersample.fit_resample(X, y)
-    print('\nSimple Random UnderSampling\n', Counter(y_rus))
-    print(X_rus.shape)
-    
-    #------------------------------
-    # Simple combine ROS and RUS
-    # [Numerical + catgeorical]
-    #------------------------------
-    oversample = RandomOverSampler(sampling_strategy='minority')
-    X_ros, y_ros = oversample.fit_resample(X, y)
-    undersample = RandomUnderSampler(sampling_strategy='majority')
-    X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
-    print('\nSimple Combined Over and UnderSampling\n',  Counter(y_rouC))
-    print(X_rouC.shape)
-    
-    #------------------------------
-    # SMOTE_NC: oversampling 
-    # [numerical + categorical]
-    #https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
-    #------------------------------
-    # Determine categorical and numerical features
-    numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
-    numerical_ix
-    num_featuresL = list(numerical_ix)
-    numerical_colind = X.columns.get_indexer(list(numerical_ix) )
-    numerical_colind
-    
-    categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
-    categorical_ix    
-    categorical_colind = X.columns.get_indexer(list(categorical_ix))
-    categorical_colind
-    
-    k_sm = 5 # 5 is default
-    sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
-    X_smnc, y_smnc = sm_nc.fit_resample(X, y)
-    print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
-    print(X_smnc.shape)
-    globals().update(locals()) # TROLOLOLOLOLOLS
-    #print("i did a horrible hack :-)")
-    ###############################################################################
-    #%% SMOTE RESAMPLING for NUMERICAL ONLY*
-    # #------------------------------
-    # # SMOTE: Oversampling
-    # # [Numerical ONLY]
-    # #------------------------------
-    # k_sm = 1
-    # sm = SMOTE(sampling_strategy = 'auto', k_neighbors = k_sm, **rs)
-    # X_sm, y_sm = sm.fit_resample(X, y)
-    # print(X_sm.shape)
-    # print('\nSMOTE OverSampling\n', Counter(y_sm))
-    # y_sm_df = y_sm.to_frame()
-    # y_sm_df.value_counts().plot(kind = 'bar')
-    
-    # #------------------------------
-    # # SMOTE: Over + Undersampling COMBINED
-    # # [Numerical ONLY]
-    # #-----------------------------
-    # sm_enn = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all', **rs, **njobs ))
-    # X_enn, y_enn = sm_enn.fit_resample(X, y)
-    # print(X_enn.shape)
-    # print('\nSMOTE Over+Under Sampling combined\n', Counter(y_enn))
-    
-    ###############################################################################
-    # TODO: Find over and undersampling JUST for categorical data
-        ###########################################################################
-    
-    print('\n#################################################################'
-          , '\nDim of X for gene:', gene.lower(), '\n',  X.shape
-          , '\n###############################################################')
--- a/scripts/ml/ml_data_cd_7030.py
+++ b/scripts/ml/ml_data_cd_7030.py
@ -1,808 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Sun Mar  6 13:41:54 2022
-
-@author: tanu
-"""
-def setvars(gene,drug):
-    #https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
-    import os, sys
-    import pandas as pd
-    import numpy as np
-    print(np.__version__)
-    print(pd.__version__)
-    import pprint as pp
-    from copy import deepcopy
-    from collections import Counter
-    from sklearn.impute import KNNImputer as KNN
-    from imblearn.over_sampling import RandomOverSampler
-    from imblearn.under_sampling import RandomUnderSampler
-    from imblearn.over_sampling import SMOTE
-    from sklearn.datasets import make_classification
-    from imblearn.combine import SMOTEENN
-    from imblearn.combine import SMOTETomek
-    
-    from imblearn.over_sampling import SMOTENC
-    from imblearn.under_sampling import EditedNearestNeighbours
-    from imblearn.under_sampling import RepeatedEditedNearestNeighbours
-    
-    from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
-    from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
-    
-    from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
-    from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
-    
-    from sklearn.pipeline import Pipeline, make_pipeline
-    import argparse
-    import re
-    #%% GLOBALS
-    tts_split = "70_30"
-
-    rs = {'random_state': 42}
-    njobs = {'n_jobs': 10}
-    
-    scoring_fn =  ({ 'mcc'         : make_scorer(matthews_corrcoef)
-                    , 'accuracy'   : make_scorer(accuracy_score)
-                    , 'fscore'     : make_scorer(f1_score)
-                    , 'precision'  : make_scorer(precision_score)
-                    , 'recall'     : make_scorer(recall_score)
-                    , 'roc_auc'    : make_scorer(roc_auc_score)
-                    , 'jcc'        : make_scorer(jaccard_score)
-                }) 
-      
-    skf_cv = StratifiedKFold(n_splits = 10
-                              #, shuffle = False, random_state= None)
-                               , shuffle = True,**rs)
-    
-    rskf_cv = RepeatedStratifiedKFold(n_splits = 10
-                                      , n_repeats = 3
-                                      , **rs)
-    
-    mcc_score_fn  = {'mcc': make_scorer(matthews_corrcoef)}
-    jacc_score_fn = {'jcc': make_scorer(jaccard_score)}   
-    #%% FOR LATER: Combine ED logo data
-    ###########################################################################
-
-    homedir = os.path.expanduser("~")
-    
-    geneL_basic     = ['pnca']
-    geneL_na        = ['gid']
-    geneL_na_ppi2   = ['rpob']
-    geneL_ppi2      = ['alr', 'embb', 'katg']
-    
-    #num_type = ['int64', 'float64']
-    num_type = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
-    cat_type = ['object', 'bool']
-    
-    #==============
-    # directories
-    #==============
-    datadir = homedir + '/git/Data/'
-    indir   = datadir + drug + '/input/'
-    outdir  = datadir + drug + '/output/'
-    
-    #=======
-    # input
-    #=======
-    
-    #---------
-    # File 1
-    #---------
-    infile_ml1 = outdir + gene.lower() + '_merged_df3.csv' 
-    #infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
-    
-    my_features_df = pd.read_csv(infile_ml1, index_col = 0) 
-    my_features_df  = my_features_df .reset_index(drop = True)
-    my_features_df.index
-    
-    my_features_df.dtypes
-    mycols = my_features_df.columns
-    
-    #---------
-    # File 2
-    #---------
-    infile_aaindex = outdir + 'aa_index/' + gene.lower() + '_aa.csv' 
-    aaindex_df = pd.read_csv(infile_aaindex, index_col = 0) 
-    aaindex_df.dtypes
-    
-    #-----------
-    # check for non-numerical columns
-    #-----------
-    if any(aaindex_df.dtypes==object):
-        print('\naaindex_df contains non-numerical data')
-    
-    aaindex_df_object = aaindex_df.select_dtypes(include = cat_type)
-    print('\nTotal no. of non-numerial columns:', len(aaindex_df_object.columns))
-    
-    expected_aa_ncols = len(aaindex_df.columns) - len(aaindex_df_object.columns)
-    
-    #-----------
-    # Extract numerical data only
-    #-----------
-    print('\nSelecting numerical data only')
-    aaindex_df = aaindex_df.select_dtypes(include = num_type)
-    
-    #---------------------------
-    # aaindex: sanity check 1
-    #---------------------------
-    if len(aaindex_df.columns) == expected_aa_ncols:
-        print('\nPASS: successfully selected numerical columns only for aaindex_df')
-    else:
-        print('\nFAIL: Numbers mismatch'
-              , '\nExpected ncols:', expected_aa_ncols
-              , '\nGot:', len(aaindex_df.columns))    
-        
-    #---------------
-    # check for NA
-    #---------------
-    print('\nNow checking for NA in the remaining aaindex_cols')
-    c1 = aaindex_df.isna().sum()
-    c2 = c1.sort_values(ascending=False)
-    print('\nCounting aaindex_df cols with NA'
-          , '\nncols with NA:', sum(c2>0), 'columns'
-          , '\nDropping these...'
-          , '\nOriginal ncols:', len(aaindex_df.columns)
-          )
-    aa_df = aaindex_df.dropna(axis=1)
-    
-    print('\nRevised df ncols:', len(aa_df.columns))
-    
-    c3 = aa_df.isna().sum()
-    c4 = c3.sort_values(ascending=False)
-    
-    print('\nChecking NA in revised df...')
-    
-    if sum(c4>0):
-        sys.exit('\nFAIL: aaindex_df still contains cols with NA, please check and drop these before proceeding...')
-    else:
-        print('\nPASS: cols with NA successfully dropped from aaindex_df'
-              , '\nProceeding with combining aa_df with other features_df')
-        
-    #---------------------------
-    # aaindex: sanity check 2
-    #---------------------------
-    expected_aa_ncols2 =  len(aaindex_df.columns) - sum(c2>0)  
-    if len(aa_df.columns) == expected_aa_ncols2:
-        print('\nPASS: ncols match'
-              , '\nExpected ncols:', expected_aa_ncols2
-              , '\nGot:', len(aa_df.columns))
-    else:
-        print('\nFAIL: Numbers mismatch'
-              , '\nExpected ncols:', expected_aa_ncols2
-              , '\nGot:', len(aa_df.columns))            
-        
-    # Important: need this to identify aaindex cols    
-    aa_df_cols = aa_df.columns
-    print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
-    
-    ###############################################################################
-    #%% Combining my_features_df and aaindex_df
-    #===========================
-    # Merge my_df + aaindex_df
-    #===========================
-    
-    if aa_df.columns[aa_df.columns.isin(my_features_df.columns)] == my_features_df.columns[my_features_df.columns.isin(aa_df.columns)]:
-        print('\nMerging on column: mutationinformation')   
-    
-    if len(my_features_df) == len(aa_df):
-        expected_nrows = len(my_features_df)
-        print('\nProceeding to merge, expected nrows in merged_df:', expected_nrows)
-    else:
-        sys.exit('\nNrows mismatch, cannot merge. Please check'
-              , '\nnrows my_df:', len(my_features_df)
-              , '\nnrows aa_df:', len(aa_df))
-               
-    #-----------------
-    # Reset index: mutationinformation
-    # Very important for merging
-    #-----------------
-    aa_df = aa_df.reset_index()
-    
-    expected_ncols = len(my_features_df.columns) + len(aa_df.columns) - 1 # for the no. of merging col
-    
-    #-----------------
-    # Merge: my_features_df + aa_df
-    #-----------------
-    merged_df = pd.merge(my_features_df
-                         , aa_df
-                         , on = 'mutationinformation')
-    
-    #---------------------------
-    # aaindex: sanity check 3
-    #---------------------------
-    if len(merged_df.columns) == expected_ncols:
-        print('\nPASS: my_features_df and aa_df successfully combined'
-              , '\nnrows:', len(merged_df)
-              , '\nncols:', len(merged_df.columns))
-    else:
-        sys.exit('\nFAIL: could not combine my_features_df and aa_df'
-                 , '\nCheck dims and merging cols!')
-        
-    #--------
-    # Reassign so downstream code doesn't need to change
-    #--------
-    my_df = merged_df.copy()
-    
-    #%% Data: my_df
-    # Check if non structural pos have crept in
-    # IDEALLY remove from source! But for rpoB do it here
-    # Drop NA where numerical cols have them
-    if gene.lower() in geneL_na_ppi2:
-        #D1148 get rid of
-        na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
-        my_df = my_df.drop(index=na_index)
-    
-    # FIXED: complete data for all muts inc L114M, F115L, V123L, V125I, V131M
-    # if gene.lower() in ['embb']:
-    #     na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
-    #     my_df = my_df.drop(index=na_index)
-    
-    # # Sanity check for non-structural positions
-    # print('\nChecking for non-structural postions')
-    # na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
-    # if len(na_index) > 0:
-    #     print('\nNon-structural positions detected for gene:', gene.lower()
-    #           , '\nTotal number of these detected:', len(na_index)
-    #           , '\These are at index:', na_index
-    #           , '\nOriginal nrows:', len(my_df)
-    #           , '\nDropping these...')
-    #     my_df = my_df.drop(index=na_index)
-    #     print('\nRevised nrows:', len(my_df))
-    # else:
-    #     print('\nNo non-structural positions detected for gene:', gene.lower()
-    #           , '\nnrows:', len(my_df))
-              
-    
-    ###########################################################################
-    #%% Add lineage calculation columns
-    #FIXME: Check if this can be imported from config?
-    total_mtblineage_uc = 8
-    lineage_colnames = ['lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode']
-    #bar = my_df[lineage_colnames]
-    my_df['lineage_proportion']      = my_df['lineage_count_unique']/my_df['lineage_count_all']
-    my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineage_uc
-    ###########################################################################
-    #%% Active site annotation column
-    # change from numberic to categorical
-    
-    if my_df['active_site'].dtype in num_type:
-        my_df['active_site'] = my_df['active_site'].astype(object)
-        my_df['active_site'].dtype
-    #%% AA property change
-    #--------------------
-    # Water prop change
-    #--------------------
-    my_df['water_change'] = my_df['wt_prop_water'] + str('_to_') + my_df['mut_prop_water']
-    my_df['water_change'].value_counts()
-    
-    water_prop_changeD = {
-        'hydrophobic_to_neutral'          : 'change'
-        , 'hydrophobic_to_hydrophobic'    : 'no_change'
-        , 'neutral_to_neutral'            : 'no_change'
-        , 'neutral_to_hydrophobic'        : 'change'
-        , 'hydrophobic_to_hydrophilic'    : 'change'
-        , 'neutral_to_hydrophilic'        : 'change'
-        , 'hydrophilic_to_neutral'        : 'change'
-        , 'hydrophilic_to_hydrophobic'    : 'change'
-        , 'hydrophilic_to_hydrophilic'    : 'no_change'
-    }
-    
-    my_df['water_change'] = my_df['water_change'].map(water_prop_changeD)
-    my_df['water_change'].value_counts()
-    
-    #--------------------
-    # Polarity change
-    #--------------------
-    my_df['polarity_change'] = my_df['wt_prop_polarity'] + str('_to_') + my_df['mut_prop_polarity']
-    my_df['polarity_change'].value_counts()
-    
-    polarity_prop_changeD = {
-        'non-polar_to_non-polar'     : 'no_change'
-        , 'non-polar_to_neutral'     : 'change'  
-        , 'neutral_to_non-polar'     : 'change'  
-        , 'neutral_to_neutral'       : 'no_change'  
-        , 'non-polar_to_basic'       : 'change'  
-        , 'acidic_to_neutral'        : 'change'  
-        , 'basic_to_neutral'         : 'change'  
-        , 'non-polar_to_acidic'      : 'change'  
-        , 'neutral_to_basic'         : 'change'  
-        , 'acidic_to_non-polar'      : 'change'  
-        , 'basic_to_non-polar'       : 'change'
-        , 'neutral_to_acidic'        : 'change'
-        , 'acidic_to_acidic'         : 'no_change'
-        , 'basic_to_acidic'          : 'change'
-        , 'basic_to_basic'           : 'no_change'
-        , 'acidic_to_basic'          : 'change'}
-    
-    my_df['polarity_change'] = my_df['polarity_change'].map(polarity_prop_changeD)
-    my_df['polarity_change'].value_counts()
-    
-    #--------------------
-    # Electrostatics change
-    #--------------------
-    my_df['electrostatics_change'] = my_df['wt_calcprop'] + str('_to_') + my_df['mut_calcprop']
-    my_df['electrostatics_change'].value_counts()
-    
-    calc_prop_changeD = {
-            'non-polar_to_non-polar'     : 'no_change'
-            , 'non-polar_to_polar'       : 'change'
-            , 'polar_to_non-polar'       : 'change'
-            , 'non-polar_to_pos'         : 'change'
-            , 'neg_to_non-polar'         : 'change'
-            , 'non-polar_to_neg'         : 'change'
-            , 'pos_to_polar'             : 'change'
-            , 'pos_to_non-polar'         : 'change'
-            , 'polar_to_polar'           : 'no_change'
-            , 'neg_to_neg'               : 'no_change'
-            , 'polar_to_neg'             : 'change'
-            , 'pos_to_neg'               : 'change'
-            , 'pos_to_pos'               : 'no_change'
-            , 'polar_to_pos'             : 'change'
-            , 'neg_to_polar'             : 'change'
-            , 'neg_to_pos'               : 'change'
-    }
-    
-    my_df['electrostatics_change'] = my_df['electrostatics_change'].map(calc_prop_changeD)
-    my_df['electrostatics_change'].value_counts()
-    
-    #--------------------    
-    # Summary change: Create a combined column summarising these three cols
-    #--------------------
-    detect_change = 'change'
-    check_prop_cols = ['water_change', 'polarity_change', 'electrostatics_change']
-    #my_df['aa_prop_change'] = (my_df.values == detect_change).any(1).astype(int)
-    my_df['aa_prop_change'] = (my_df[check_prop_cols].values == detect_change).any(1).astype(int)
-    my_df['aa_prop_change'].value_counts()
-    my_df['aa_prop_change'].dtype
-    
-    my_df['aa_prop_change'] = my_df['aa_prop_change'].map({1:'change'
-                                                           , 0: 'no_change'})
-    
-    my_df['aa_prop_change'].value_counts()
-    my_df['aa_prop_change'].dtype
-    
-    #%% IMPUTE values for OR [check script for exploration: UQ_or_imputer]
-    #--------------------
-    # Impute OR values
-    #--------------------
-    #or_cols = ['or_mychisq', 'log10_or_mychisq', 'or_fisher']
-    sel_cols = ['mutationinformation', 'or_mychisq', 'log10_or_mychisq']
-    or_cols = ['or_mychisq', 'log10_or_mychisq']
-    
-    print("count of NULL values before imputation\n")
-    print(my_df[or_cols].isnull().sum())
-    
-    my_dfI = pd.DataFrame(index = my_df['mutationinformation'] )
-    
-        
-    my_dfI = pd.DataFrame(KNN(n_neighbors=3, weights="uniform").fit_transform(my_df[or_cols])
-                          , index =  my_df['mutationinformation']
-                          , columns = or_cols )
-    my_dfI.columns = ['or_rawI', 'logorI']
-    my_dfI.columns
-    my_dfI = my_dfI.reset_index(drop = False) # prevents old index from being added as a column
-    my_dfI.head()
-    print("count of NULL values AFTER imputation\n")
-    print(my_dfI.isnull().sum())
-    
-    #-------------------------------------------
-    # OR df Merge: with original based on index
-    #-------------------------------------------
-    #my_df['index_bm'] = my_df.index
-    mydf_imputed = pd.merge(my_df
-                        , my_dfI
-                        , on = 'mutationinformation')
-    #mydf_imputed = mydf_imputed.set_index(['index_bm'])
-    
-    my_df['log10_or_mychisq'].isna().sum()
-    mydf_imputed['log10_or_mychisq'].isna().sum()
-    mydf_imputed['logorI'].isna().sum() # should be 0
-    
-    len(my_df.columns)
-    len(mydf_imputed.columns)  
-    
-    #-----------------------------------------
-    # REASSIGN my_df after imputing OR values
-    #-----------------------------------------
-    my_df = mydf_imputed.copy()
-    
-    if my_df['logorI'].isna().sum() == 0:
-        print('\nPASS: OR values imputed, data ready for ML')
-    else:
-        sys.exit('\nFAIL: something went wrong, Data not ready for ML. Please check upstream!')
-    
-    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    #---------------------------------------
-    # TODO: try other imputation like MICE
-    #---------------------------------------
-    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    
-    #%%########################################################################
-    #==========================
-    #     Data for ML
-    #==========================
-    my_df_ml = my_df.copy()
-    
-    # Build column names to mask for affinity chanhes
-    if gene.lower() in geneL_basic:
-        #X_stabilityN = common_cols_stabiltyN
-        gene_affinity_colnames = []# not needed as its the common ones 
-        cols_to_mask = ['ligand_affinity_change']
-        
-    if gene.lower() in geneL_ppi2:
-        gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] 
-        #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
-    
-    if gene.lower() in geneL_na:
-        gene_affinity_colnames =  ['mcsm_na_affinity'] 
-        #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
-    
-    if gene.lower() in geneL_na_ppi2:
-        gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-        #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
-    
-    #=======================
-    # Masking columns:
-    # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
-    #=======================
-    my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
-    my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
-    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
-    
-    # mask the mcsm affinity related columns where ligand distance > 10
-    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
-    (my_df_ml['ligand_affinity_change'] == 0).sum()
-    
-    mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]  
-    
-    #===================================================
-    # write file for check
-    mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
-    mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
-    #===================================================
-    ###############################################################################
-    #%% Feature groups (FG): Build X for Input ML 
-    ############################################################################
-    #===========================
-    # FG1: Evolutionary features
-    #===========================
-    X_evolFN =  ['consurf_score'
-               , 'snap2_score'
-               , 'provean_score']
-    
-    ###############################################################################
-    #========================
-    # FG2: Stability features
-    #========================
-    #--------
-    # common
-    #--------
-    X_common_stability_Fnum = [
-               'duet_stability_change'
-               , 'ddg_foldx'
-               , 'deepddg'
-               , 'ddg_dynamut2'
-               , 'contacts']
-    #--------
-    # FoldX
-    #--------
-    X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
-    , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
-    , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
-    , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
-    , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
-    , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss']
-    
-    X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum
-    
-    ###############################################################################
-    #===================
-    # FG3: Affinity features
-    #===================
-    common_affinity_Fnum =  ['ligand_distance'
-                    , 'ligand_affinity_change'
-                    , 'mmcsm_lig']
-    
-    # if gene.lower() in geneL_basic:
-    #     X_affinityFN = common_affinity_Fnum 
-    # else:
-    #     X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
-        
-    X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
-    
-    ###############################################################################
-    #============================
-    # FG4: Residue level features
-    #============================
-    #-----------
-    # AA index
-    #-----------
-    X_aaindex_Fnum = list(aa_df_cols)
-    print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum))
-    
-    #-----------------
-    # surface area
-    # depth
-    # hydrophobicity
-    #-----------------
-    X_str_Fnum =  ['rsa'
-               #, 'asa'
-               , 'kd_values'
-               , 'rd_values']   
-    
-    #---------------------------
-    # Other aa properties
-    # active site indication
-    #---------------------------
-    X_aap_Fcat = ['ss_class'
-                # , 'wt_prop_water'
-                # , 'mut_prop_water'
-                # , 'wt_prop_polarity'
-                # , 'mut_prop_polarity'
-                # , 'wt_calcprop'
-                # , 'mut_calcprop'
-                , 'aa_prop_change'
-                , 'electrostatics_change'
-                , 'polarity_change'
-                , 'water_change'
-                , 'active_site']
-       
-    X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
-    ###############################################################################
-    #========================
-    # FG5: Genomic features
-    #========================
-    X_gn_mafor_Fnum =  ['maf'
-                    #, 'logorI'
-                    # , 'or_rawI'
-                    # , 'or_mychisq'
-                    # , 'or_logistic'
-                    # , 'or_fisher'
-                    # , 'pval_fisher'
-                    ]
-    
-    X_gn_linegae_Fnum  = ['lineage_proportion'
-                          , 'dist_lineage_proportion'
-                          #, 'lineage' # could be included as a category but it has L2;L4  formatting
-                          , 'lineage_count_all'
-                          , 'lineage_count_unique'
-                          ]
-    
-    # X_gn_Fcat = ['drtype_mode_labels'  # beware then you can't use it to predict [USED it for uq_v1, not v2]
-    #                #, 'gene_name' # will be required for the combined stuff
-    #              ]
-    X_gn_Fcat = []
-    
-    X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat
-    ###############################################################################
-    #========================
-    # FG6 collapsed: Structural : Atability + Affinity + ResidueProp
-    #========================
-    X_structural_FN =  X_stability_FN + X_affinityFN + X_resprop_FN
-    
-    ###############################################################################
-    #========================
-    # BUILDING all features
-    #========================
-    all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
-    
-    ###############################################################################
-    #%% Define training and test data
-    #================================================================
-    # Training and BLIND test set: 70/30
-    # dst with actual values  : training set
-    # dst with imputed values : THROW AWAY [unrepresentative]
-    #================================================================
-    my_df_ml[drug].isna().sum()
-    
-    #    blind_test_df = my_df_ml[my_df_ml[drug].isna()]
-    #    blind_test_df.shape
-    
-    #training_df = my_df_ml[my_df_ml[drug].notna()]
-    #training_df.shape
-    
-    training_df = my_df_ml.copy()
-    
-    # Target 1: dst_mode
-    training_df[drug].value_counts()
-    training_df['dst_mode'].value_counts()
-    
-    ####################################################################
-    #====================================
-    # ML data: Train test split: 70/30
-    # with stratification
-    # 70% : training_data for CV
-    # 30% : blind test 
-    #=====================================
-    x_features = training_df[all_featuresN]
-    y_target   = training_df['dst_mode']
-    
-    # sanity check
-    if not 'dst_mode' in x_features.columns:
-        print('\nPASS: x_features has no target variable')
-        x_ncols = len(x_features.columns)
-        print('\nNo. of columns for x_features:', x_ncols)
-        # NEED It for scaling law split
-        #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d
-    else:
-        sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')
-    #-------------------
-    # train-test split
-    #-------------------
-    #x_train, x_test, y_train, y_test # traditional var_names
-    # so my downstream code doesn't need to change    
-    X, X_bts, y, y_bts = train_test_split(x_features, y_target
-                                                    , test_size = 0.33
-                                                    , **rs
-                                                    , stratify = y_target)
-    yc1 = Counter(y)
-    yc1_ratio = yc1[0]/yc1[1]
-    
-    yc2 = Counter(y_bts)
-    yc2_ratio = yc2[0]/yc2[1]
-    
-    ###############################################################################
-    #======================================================
-    # Determine categorical and numerical features
-    #======================================================
-    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
-    numerical_cols 
-    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
-    categorical_cols 
-    
-    ################################################################################
-    # IMPORTANT sanity checks
-    if len(X.columns) == len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN):
-        print('\nPASS: ML data with input features, training and test generated...'
-              , '\n\nTotal no. of input features:'        , len(X.columns)
-              , '\n--------No. of numerical features:'    , len(numerical_cols)
-              , '\n--------No. of categorical features:'  , len(categorical_cols)
-              
-              , '\n\nTotal no. of evolutionary features:' , len(X_evolFN)
-              
-              , '\n\nTotal no. of stability features:'    , len(X_stability_FN)
-              , '\n--------Common stabilty cols:'         , len(X_common_stability_Fnum)
-              , '\n--------Foldx cols:'                   , len(X_foldX_Fnum)
-              
-              , '\n\nTotal no. of affinity features:'     , len(X_affinityFN)
-              , '\n--------Common affinity cols:'         , len(common_affinity_Fnum)
-              , '\n--------Gene specific affinity cols:'  , len(gene_affinity_colnames)
-              
-              , '\n\nTotal no. of residue level features:', len(X_resprop_FN)
-              , '\n--------AA index cols:'                , len(X_aaindex_Fnum)
-              , '\n--------Residue Prop cols:'            , len(X_str_Fnum)
-              , '\n--------AA change Prop cols:'          , len(X_aap_Fcat)
-              
-              , '\n\nTotal no. of genomic features:'      , len(X_genomicFN)
-              , '\n--------MAF+OR cols:'                  , len(X_gn_mafor_Fnum)
-              , '\n--------Lineage cols:'                 , len(X_gn_linegae_Fnum)
-              , '\n--------Other cols:'                   , len(X_gn_Fcat)
-              )
-    else:
-        print('\nFAIL: numbers mismatch'
-              , '\nExpected:',len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN)
-              , '\nGot:', len(X.columns))
-        sys.exit()
-    ###############################################################################
-    print('\n-------------------------------------------------------------'
-          , '\nSuccessfully split data: ALL features'
-          , '\nactual values: training set'
-          ,  '\nSplit:', tts_split
-          #, '\nimputed values: blind test set'
-          
-          , '\n\nTotal data size:', len(X) + len(X_bts)
-    
-          , '\n\nTrain data size:', X.shape
-          , '\ny_train numbers:', yc1
-    
-          , '\n\nTest data size:', X_bts.shape
-          , '\ny_test_numbers:', yc2
-    
-          , '\n\ny_train ratio:',yc1_ratio
-          , '\ny_test ratio:', yc2_ratio
-          , '\n-------------------------------------------------------------'
-          )
-    ##########################################################################    
-    # Quick check
-    #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
-    for i in range(len(cols_to_mask)):
-        ind = i+1
-        print('\nindex:', i, '\nind:', ind)
-        print('\nMask count check:'
-              , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
-              )
-    
-    print('Original Data\n', Counter(y)
-          , 'Data dim:', X.shape)
-    ###########################################################################
-    #%% 
-    ###########################################################################
-    #                               RESAMPLING
-    ###########################################################################
-    #------------------------------
-    # Simple Random oversampling
-    # [Numerical + catgeorical]
-    #------------------------------
-    oversample = RandomOverSampler(sampling_strategy='minority')
-    X_ros, y_ros = oversample.fit_resample(X, y)
-    print('\nSimple Random OverSampling\n', Counter(y_ros))
-    print(X_ros.shape)
-    
-    #------------------------------
-    # Simple Random Undersampling
-    # [Numerical + catgeorical]
-    #------------------------------
-    undersample = RandomUnderSampler(sampling_strategy='majority')
-    X_rus, y_rus = undersample.fit_resample(X, y)
-    print('\nSimple Random UnderSampling\n', Counter(y_rus))
-    print(X_rus.shape)
-    
-    #------------------------------
-    # Simple combine ROS and RUS
-    # [Numerical + catgeorical]
-    #------------------------------
-    oversample = RandomOverSampler(sampling_strategy='minority')
-    X_ros, y_ros = oversample.fit_resample(X, y)
-    undersample = RandomUnderSampler(sampling_strategy='majority')
-    X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
-    print('\nSimple Combined Over and UnderSampling\n',  Counter(y_rouC))
-    print(X_rouC.shape)
-    
-    #------------------------------
-    # SMOTE_NC: oversampling 
-    # [numerical + categorical]
-    #https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
-    #------------------------------
-    # Determine categorical and numerical features
-    numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
-    numerical_ix
-    num_featuresL = list(numerical_ix)
-    numerical_colind = X.columns.get_indexer(list(numerical_ix) )
-    numerical_colind
-    
-    categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
-    categorical_ix    
-    categorical_colind = X.columns.get_indexer(list(categorical_ix))
-    categorical_colind
-    
-    k_sm = 5 # 5 is default
-    sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
-    X_smnc, y_smnc = sm_nc.fit_resample(X, y)
-    print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
-    print(X_smnc.shape)
-    globals().update(locals()) # TROLOLOLOLOLOLS
-    #print("i did a horrible hack :-)")
-    ###############################################################################
-    #%% SMOTE RESAMPLING for NUMERICAL ONLY*
-    # #------------------------------
-    # # SMOTE: Oversampling
-    # # [Numerical ONLY]
-    # #------------------------------
-    # k_sm = 1
-    # sm = SMOTE(sampling_strategy = 'auto', k_neighbors = k_sm, **rs)
-    # X_sm, y_sm = sm.fit_resample(X, y)
-    # print(X_sm.shape)
-    # print('\nSMOTE OverSampling\n', Counter(y_sm))
-    # y_sm_df = y_sm.to_frame()
-    # y_sm_df.value_counts().plot(kind = 'bar')
-    
-    # #------------------------------
-    # # SMOTE: Over + Undersampling COMBINED
-    # # [Numerical ONLY]
-    # #-----------------------------
-    # sm_enn = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all', **rs, **njobs ))
-    # X_enn, y_enn = sm_enn.fit_resample(X, y)
-    # print(X_enn.shape)
-    # print('\nSMOTE Over+Under Sampling combined\n', Counter(y_enn))
-    
-    ###############################################################################
-    # TODO: Find over and undersampling JUST for categorical data
-    ###########################################################################
-    
-    print('\n#################################################################'
-          , '\nDim of X for gene:', gene.lower(), '\n',  X.shape
-          , '\n###############################################################')
--- a/scripts/ml/ml_data_cd_8020.py
+++ b/scripts/ml/ml_data_cd_8020.py
@ -1,808 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Sun Mar  6 13:41:54 2022
-
-@author: tanu
-"""
-def setvars(gene,drug):
-    #https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
-    import os, sys
-    import pandas as pd
-    import numpy as np
-    print(np.__version__)
-    print(pd.__version__)
-    import pprint as pp
-    from copy import deepcopy
-    from collections import Counter
-    from sklearn.impute import KNNImputer as KNN
-    from imblearn.over_sampling import RandomOverSampler
-    from imblearn.under_sampling import RandomUnderSampler
-    from imblearn.over_sampling import SMOTE
-    from sklearn.datasets import make_classification
-    from imblearn.combine import SMOTEENN
-    from imblearn.combine import SMOTETomek
-    
-    from imblearn.over_sampling import SMOTENC
-    from imblearn.under_sampling import EditedNearestNeighbours
-    from imblearn.under_sampling import RepeatedEditedNearestNeighbours
-    
-    from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
-    from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
-    
-    from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
-    from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
-    
-    from sklearn.pipeline import Pipeline, make_pipeline
-    import argparse
-    import re
-    #%% GLOBALS
-    tts_split = "80_20"
-
-    rs = {'random_state': 42}
-    njobs = {'n_jobs': 10}
-    
-    scoring_fn =  ({ 'mcc'         : make_scorer(matthews_corrcoef)
-                    , 'accuracy'   : make_scorer(accuracy_score)
-                    , 'fscore'     : make_scorer(f1_score)
-                    , 'precision'  : make_scorer(precision_score)
-                    , 'recall'     : make_scorer(recall_score)
-                    , 'roc_auc'    : make_scorer(roc_auc_score)
-                    , 'jcc'        : make_scorer(jaccard_score)
-                }) 
-      
-    skf_cv = StratifiedKFold(n_splits = 10
-                              #, shuffle = False, random_state= None)
-                               , shuffle = True,**rs)
-    
-    rskf_cv = RepeatedStratifiedKFold(n_splits = 10
-                                      , n_repeats = 3
-                                      , **rs)
-    
-    mcc_score_fn  = {'mcc': make_scorer(matthews_corrcoef)}
-    jacc_score_fn = {'jcc': make_scorer(jaccard_score)}   
-    #%% FOR LATER: Combine ED logo data
-    ###########################################################################
-
-    homedir = os.path.expanduser("~")
-    
-    geneL_basic     = ['pnca']
-    geneL_na        = ['gid']
-    geneL_na_ppi2   = ['rpob']
-    geneL_ppi2      = ['alr', 'embb', 'katg']
-    
-    #num_type = ['int64', 'float64']
-    num_type = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
-    cat_type = ['object', 'bool']
-    
-    #==============
-    # directories
-    #==============
-    datadir = homedir + '/git/Data/'
-    indir   = datadir + drug + '/input/'
-    outdir  = datadir + drug + '/output/'
-    
-    #=======
-    # input
-    #=======
-    
-    #---------
-    # File 1
-    #---------
-    infile_ml1 = outdir + gene.lower() + '_merged_df3.csv' 
-    #infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
-    
-    my_features_df = pd.read_csv(infile_ml1, index_col = 0) 
-    my_features_df  = my_features_df .reset_index(drop = True)
-    my_features_df.index
-    
-    my_features_df.dtypes
-    mycols = my_features_df.columns
-    
-    #---------
-    # File 2
-    #---------
-    infile_aaindex = outdir + 'aa_index/' + gene.lower() + '_aa.csv' 
-    aaindex_df = pd.read_csv(infile_aaindex, index_col = 0) 
-    aaindex_df.dtypes
-    
-    #-----------
-    # check for non-numerical columns
-    #-----------
-    if any(aaindex_df.dtypes==object):
-        print('\naaindex_df contains non-numerical data')
-    
-    aaindex_df_object = aaindex_df.select_dtypes(include = cat_type)
-    print('\nTotal no. of non-numerial columns:', len(aaindex_df_object.columns))
-    
-    expected_aa_ncols = len(aaindex_df.columns) - len(aaindex_df_object.columns)
-    
-    #-----------
-    # Extract numerical data only
-    #-----------
-    print('\nSelecting numerical data only')
-    aaindex_df = aaindex_df.select_dtypes(include = num_type)
-    
-    #---------------------------
-    # aaindex: sanity check 1
-    #---------------------------
-    if len(aaindex_df.columns) == expected_aa_ncols:
-        print('\nPASS: successfully selected numerical columns only for aaindex_df')
-    else:
-        print('\nFAIL: Numbers mismatch'
-              , '\nExpected ncols:', expected_aa_ncols
-              , '\nGot:', len(aaindex_df.columns))    
-        
-    #---------------
-    # check for NA
-    #---------------
-    print('\nNow checking for NA in the remaining aaindex_cols')
-    c1 = aaindex_df.isna().sum()
-    c2 = c1.sort_values(ascending=False)
-    print('\nCounting aaindex_df cols with NA'
-          , '\nncols with NA:', sum(c2>0), 'columns'
-          , '\nDropping these...'
-          , '\nOriginal ncols:', len(aaindex_df.columns)
-          )
-    aa_df = aaindex_df.dropna(axis=1)
-    
-    print('\nRevised df ncols:', len(aa_df.columns))
-    
-    c3 = aa_df.isna().sum()
-    c4 = c3.sort_values(ascending=False)
-    
-    print('\nChecking NA in revised df...')
-    
-    if sum(c4>0):
-        sys.exit('\nFAIL: aaindex_df still contains cols with NA, please check and drop these before proceeding...')
-    else:
-        print('\nPASS: cols with NA successfully dropped from aaindex_df'
-              , '\nProceeding with combining aa_df with other features_df')
-        
-    #---------------------------
-    # aaindex: sanity check 2
-    #---------------------------
-    expected_aa_ncols2 =  len(aaindex_df.columns) - sum(c2>0)  
-    if len(aa_df.columns) == expected_aa_ncols2:
-        print('\nPASS: ncols match'
-              , '\nExpected ncols:', expected_aa_ncols2
-              , '\nGot:', len(aa_df.columns))
-    else:
-        print('\nFAIL: Numbers mismatch'
-              , '\nExpected ncols:', expected_aa_ncols2
-              , '\nGot:', len(aa_df.columns))            
-        
-    # Important: need this to identify aaindex cols    
-    aa_df_cols = aa_df.columns
-    print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
-    
-    ###############################################################################
-    #%% Combining my_features_df and aaindex_df
-    #===========================
-    # Merge my_df + aaindex_df
-    #===========================
-    
-    if aa_df.columns[aa_df.columns.isin(my_features_df.columns)] == my_features_df.columns[my_features_df.columns.isin(aa_df.columns)]:
-        print('\nMerging on column: mutationinformation')   
-    
-    if len(my_features_df) == len(aa_df):
-        expected_nrows = len(my_features_df)
-        print('\nProceeding to merge, expected nrows in merged_df:', expected_nrows)
-    else:
-        sys.exit('\nNrows mismatch, cannot merge. Please check'
-              , '\nnrows my_df:', len(my_features_df)
-              , '\nnrows aa_df:', len(aa_df))
-               
-    #-----------------
-    # Reset index: mutationinformation
-    # Very important for merging
-    #-----------------
-    aa_df = aa_df.reset_index()
-    
-    expected_ncols = len(my_features_df.columns) + len(aa_df.columns) - 1 # for the no. of merging col
-    
-    #-----------------
-    # Merge: my_features_df + aa_df
-    #-----------------
-    merged_df = pd.merge(my_features_df
-                         , aa_df
-                         , on = 'mutationinformation')
-    
-    #---------------------------
-    # aaindex: sanity check 3
-    #---------------------------
-    if len(merged_df.columns) == expected_ncols:
-        print('\nPASS: my_features_df and aa_df successfully combined'
-              , '\nnrows:', len(merged_df)
-              , '\nncols:', len(merged_df.columns))
-    else:
-        sys.exit('\nFAIL: could not combine my_features_df and aa_df'
-                 , '\nCheck dims and merging cols!')
-        
-    #--------
-    # Reassign so downstream code doesn't need to change
-    #--------
-    my_df = merged_df.copy()
-    
-    #%% Data: my_df
-    # Check if non structural pos have crept in
-    # IDEALLY remove from source! But for rpoB do it here
-    # Drop NA where numerical cols have them
-    if gene.lower() in geneL_na_ppi2:
-        #D1148 get rid of
-        na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
-        my_df = my_df.drop(index=na_index)
-    
-    # FIXED: complete data for all muts inc L114M, F115L, V123L, V125I, V131M
-    # if gene.lower() in ['embb']:
-    #     na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
-    #     my_df = my_df.drop(index=na_index)
-    
-    # # Sanity check for non-structural positions
-    # print('\nChecking for non-structural postions')
-    # na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
-    # if len(na_index) > 0:
-    #     print('\nNon-structural positions detected for gene:', gene.lower()
-    #           , '\nTotal number of these detected:', len(na_index)
-    #           , '\These are at index:', na_index
-    #           , '\nOriginal nrows:', len(my_df)
-    #           , '\nDropping these...')
-    #     my_df = my_df.drop(index=na_index)
-    #     print('\nRevised nrows:', len(my_df))
-    # else:
-    #     print('\nNo non-structural positions detected for gene:', gene.lower()
-    #           , '\nnrows:', len(my_df))
-              
-    
-    ###########################################################################
-    #%% Add lineage calculation columns
-    #FIXME: Check if this can be imported from config?
-    total_mtblineage_uc = 8
-    lineage_colnames = ['lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode']
-    #bar = my_df[lineage_colnames]
-    my_df['lineage_proportion']      = my_df['lineage_count_unique']/my_df['lineage_count_all']
-    my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineage_uc
-    ###########################################################################
-    #%% Active site annotation column
-    # change from numeric to categorical
-    
-    if my_df['active_site'].dtype in num_type:
-        my_df['active_site'] = my_df['active_site'].astype(object)
-        my_df['active_site'].dtype
-    #%% AA property change
-    #--------------------
-    # Water prop change
-    #--------------------
-    my_df['water_change'] = my_df['wt_prop_water'] + str('_to_') + my_df['mut_prop_water']
-    my_df['water_change'].value_counts()
-    
-    water_prop_changeD = {
-        'hydrophobic_to_neutral'          : 'change'
-        , 'hydrophobic_to_hydrophobic'    : 'no_change'
-        , 'neutral_to_neutral'            : 'no_change'
-        , 'neutral_to_hydrophobic'        : 'change'
-        , 'hydrophobic_to_hydrophilic'    : 'change'
-        , 'neutral_to_hydrophilic'        : 'change'
-        , 'hydrophilic_to_neutral'        : 'change'
-        , 'hydrophilic_to_hydrophobic'    : 'change'
-        , 'hydrophilic_to_hydrophilic'    : 'no_change'
-    }
-    
-    my_df['water_change'] = my_df['water_change'].map(water_prop_changeD)
-    my_df['water_change'].value_counts()
-    
-    #--------------------
-    # Polarity change
-    #--------------------
-    my_df['polarity_change'] = my_df['wt_prop_polarity'] + str('_to_') + my_df['mut_prop_polarity']
-    my_df['polarity_change'].value_counts()
-    
-    polarity_prop_changeD = {
-        'non-polar_to_non-polar'     : 'no_change'
-        , 'non-polar_to_neutral'     : 'change'  
-        , 'neutral_to_non-polar'     : 'change'  
-        , 'neutral_to_neutral'       : 'no_change'  
-        , 'non-polar_to_basic'       : 'change'  
-        , 'acidic_to_neutral'        : 'change'  
-        , 'basic_to_neutral'         : 'change'  
-        , 'non-polar_to_acidic'      : 'change'  
-        , 'neutral_to_basic'         : 'change'  
-        , 'acidic_to_non-polar'      : 'change'  
-        , 'basic_to_non-polar'       : 'change'
-        , 'neutral_to_acidic'        : 'change'
-        , 'acidic_to_acidic'         : 'no_change'
-        , 'basic_to_acidic'          : 'change'
-        , 'basic_to_basic'           : 'no_change'
-        , 'acidic_to_basic'          : 'change'}
-    
-    my_df['polarity_change'] = my_df['polarity_change'].map(polarity_prop_changeD)
-    my_df['polarity_change'].value_counts()
-    
-    #--------------------
-    # Electrostatics change
-    #--------------------
-    my_df['electrostatics_change'] = my_df['wt_calcprop'] + str('_to_') + my_df['mut_calcprop']
-    my_df['electrostatics_change'].value_counts()
-    
-    calc_prop_changeD = {
-            'non-polar_to_non-polar'     : 'no_change'
-            , 'non-polar_to_polar'       : 'change'
-            , 'polar_to_non-polar'       : 'change'
-            , 'non-polar_to_pos'         : 'change'
-            , 'neg_to_non-polar'         : 'change'
-            , 'non-polar_to_neg'         : 'change'
-            , 'pos_to_polar'             : 'change'
-            , 'pos_to_non-polar'         : 'change'
-            , 'polar_to_polar'           : 'no_change'
-            , 'neg_to_neg'               : 'no_change'
-            , 'polar_to_neg'             : 'change'
-            , 'pos_to_neg'               : 'change'
-            , 'pos_to_pos'               : 'no_change'
-            , 'polar_to_pos'             : 'change'
-            , 'neg_to_polar'             : 'change'
-            , 'neg_to_pos'               : 'change'
-    }
-    
-    my_df['electrostatics_change'] = my_df['electrostatics_change'].map(calc_prop_changeD)
-    my_df['electrostatics_change'].value_counts()
-    
-    #--------------------    
-    # Summary change: Create a combined column summarising these three cols
-    #--------------------
-    detect_change = 'change'
-    check_prop_cols = ['water_change', 'polarity_change', 'electrostatics_change']
-    #my_df['aa_prop_change'] = (my_df.values == detect_change).any(1).astype(int)
-    my_df['aa_prop_change'] = (my_df[check_prop_cols].values == detect_change).any(1).astype(int)
-    my_df['aa_prop_change'].value_counts()
-    my_df['aa_prop_change'].dtype
-    
-    my_df['aa_prop_change'] = my_df['aa_prop_change'].map({1:'change'
-                                                           , 0: 'no_change'})
-    
-    my_df['aa_prop_change'].value_counts()
-    my_df['aa_prop_change'].dtype
-    
-    #%% IMPUTE values for OR [check script for exploration: UQ_or_imputer]
-    #--------------------
-    # Impute OR values
-    #--------------------
-    #or_cols = ['or_mychisq', 'log10_or_mychisq', 'or_fisher']
-    sel_cols = ['mutationinformation', 'or_mychisq', 'log10_or_mychisq']
-    or_cols = ['or_mychisq', 'log10_or_mychisq']
-    
-    print("count of NULL values before imputation\n")
-    print(my_df[or_cols].isnull().sum())
-    
-    my_dfI = pd.DataFrame(index = my_df['mutationinformation'] )
-    
-        
-    my_dfI = pd.DataFrame(KNN(n_neighbors=3, weights="uniform").fit_transform(my_df[or_cols])
-                          , index =  my_df['mutationinformation']
-                          , columns = or_cols )
-    my_dfI.columns = ['or_rawI', 'logorI']
-    my_dfI.columns
-    my_dfI = my_dfI.reset_index(drop = False) # keeps the old index (mutationinformation) as a column; it is the merge key below
-    my_dfI.head()
-    print("count of NULL values AFTER imputation\n")
-    print(my_dfI.isnull().sum())
-    
-    #-------------------------------------------
-    # OR df Merge: with original based on index
-    #-------------------------------------------
-    #my_df['index_bm'] = my_df.index
-    mydf_imputed = pd.merge(my_df
-                        , my_dfI
-                        , on = 'mutationinformation')
-    #mydf_imputed = mydf_imputed.set_index(['index_bm'])
-    
-    my_df['log10_or_mychisq'].isna().sum()
-    mydf_imputed['log10_or_mychisq'].isna().sum()
-    mydf_imputed['logorI'].isna().sum() # should be 0
-    
-    len(my_df.columns)
-    len(mydf_imputed.columns)  
-    
-    #-----------------------------------------
-    # REASSIGN my_df after imputing OR values
-    #-----------------------------------------
-    my_df = mydf_imputed.copy()
-    
-    if my_df['logorI'].isna().sum() == 0:
-        print('\nPASS: OR values imputed, data ready for ML')
-    else:
-        sys.exit('\nFAIL: something went wrong, Data not ready for ML. Please check upstream!')
-    
-    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    #---------------------------------------
-    # TODO: try other imputation like MICE
-    #---------------------------------------
-    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    
-    #%%########################################################################
-    #==========================
-    #     Data for ML
-    #==========================
-    my_df_ml = my_df.copy()
-    
-    # Build column names to mask for affinity changes
-    if gene.lower() in geneL_basic:
-        #X_stabilityN = common_cols_stabiltyN
-        gene_affinity_colnames = []# not needed as its the common ones 
-        cols_to_mask = ['ligand_affinity_change']
-        
-    if gene.lower() in geneL_ppi2:
-        gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] 
-        #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
-    
-    if gene.lower() in geneL_na:
-        gene_affinity_colnames =  ['mcsm_na_affinity'] 
-        #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
-    
-    if gene.lower() in geneL_na_ppi2:
-        gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-        #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
-    
-    #=======================
-    # Masking columns:
-    # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
-    #=======================
-    my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
-    my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
-    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
-    
-    # mask the mcsm affinity related columns where ligand distance > 10
-    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
-    (my_df_ml['ligand_affinity_change'] == 0).sum()
-    
-    mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]  
-    
-    #===================================================
-    # write file for check
-    mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
-    mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
-    #===================================================
-    ###############################################################################
-    #%% Feature groups (FG): Build X for Input ML 
-    ############################################################################
-    #===========================
-    # FG1: Evolutionary features
-    #===========================
-    X_evolFN =  ['consurf_score'
-               , 'snap2_score'
-               , 'provean_score']
-    
-    ###############################################################################
-    #========================
-    # FG2: Stability features
-    #========================
-    #--------
-    # common
-    #--------
-    X_common_stability_Fnum = [
-               'duet_stability_change'
-               , 'ddg_foldx'
-               , 'deepddg'
-               , 'ddg_dynamut2'
-               , 'contacts']
-    #--------
-    # FoldX
-    #--------
-    X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
-    , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
-    , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
-    , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
-    , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
-    , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss']
-    
-    X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum
-    
-    ###############################################################################
-    #===================
-    # FG3: Affinity features
-    #===================
-    common_affinity_Fnum =  ['ligand_distance'
-                    , 'ligand_affinity_change'
-                    , 'mmcsm_lig']
-    
-    # if gene.lower() in geneL_basic:
-    #     X_affinityFN = common_affinity_Fnum 
-    # else:
-    #     X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
-        
-    X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
-    
-    ###############################################################################
-    #============================
-    # FG4: Residue level features
-    #============================
-    #-----------
-    # AA index
-    #-----------
-    X_aaindex_Fnum = list(aa_df_cols)
-    print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum))
-    
-    #-----------------
-    # surface area
-    # depth
-    # hydrophobicity
-    #-----------------
-    X_str_Fnum =  ['rsa'
-               #, 'asa'
-               , 'kd_values'
-               , 'rd_values']   
-    
-    #---------------------------
-    # Other aa properties
-    # active site indication
-    #---------------------------
-    X_aap_Fcat = ['ss_class'
-                # , 'wt_prop_water'
-                # , 'mut_prop_water'
-                # , 'wt_prop_polarity'
-                # , 'mut_prop_polarity'
-                # , 'wt_calcprop'
-                # , 'mut_calcprop'
-                , 'aa_prop_change'
-                , 'electrostatics_change'
-                , 'polarity_change'
-                , 'water_change'
-                , 'active_site']
-       
-    X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
-    ###############################################################################
-    #========================
-    # FG5: Genomic features
-    #========================
-    X_gn_mafor_Fnum =  ['maf'
-                    #, 'logorI'
-                    # , 'or_rawI'
-                    # , 'or_mychisq'
-                    # , 'or_logistic'
-                    # , 'or_fisher'
-                    # , 'pval_fisher'
-                    ]
-    
-    X_gn_linegae_Fnum  = ['lineage_proportion'
-                          , 'dist_lineage_proportion'
-                          #, 'lineage' # could be included as a category but it has L2;L4  formatting
-                          , 'lineage_count_all'
-                          , 'lineage_count_unique'
-                          ]
-    
-    # X_gn_Fcat = ['drtype_mode_labels'  # beware then you can't use it to predict [USED it for uq_v1, not v2]
-    #                #, 'gene_name' # will be required for the combined stuff
-    #              ]
-    X_gn_Fcat = []
-    
-    X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat
-    ###############################################################################
-    #========================
-    # FG6 collapsed: Structural : Stability + Affinity + ResidueProp
-    #========================
-    X_structural_FN =  X_stability_FN + X_affinityFN + X_resprop_FN
-    
-    ###############################################################################
-    #========================
-    # BUILDING all features
-    #========================
-    all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
-    
-    ###############################################################################
-    #%% Define training and test data
-    #================================================================
-    # Training and BLIND test set: 80/20
-    # dst with actual values  : training set
-    # dst with imputed values : THROW AWAY [unrepresentative]
-    #================================================================
-    my_df_ml[drug].isna().sum()
-    
-    #    blind_test_df = my_df_ml[my_df_ml[drug].isna()]
-    #    blind_test_df.shape
-    
-    #training_df = my_df_ml[my_df_ml[drug].notna()]
-    #training_df.shape
-    
-    training_df = my_df_ml.copy()
-    
-    # Target 1: dst_mode
-    training_df[drug].value_counts()
-    training_df['dst_mode'].value_counts()
-    
-    ####################################################################
-    #====================================
-    # ML data: Train test split: 80/20
-    # with stratification
-    # 80% : training_data for CV
-    # 20% : blind test 
-    #=====================================
-    x_features = training_df[all_featuresN]
-    y_target   = training_df['dst_mode']
-    
-    # sanity check
-    if not 'dst_mode' in x_features.columns:
-        print('\nPASS: x_features has no target variable')
-        x_ncols = len(x_features.columns)
-        print('\nNo. of columns for x_features:', x_ncols)
-        # NEED It for scaling law split
-        #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d
-    else:
-        sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')
-    #-------------------
-    # train-test split
-    #-------------------
-    #x_train, x_test, y_train, y_test # traditional var_names
-    # so my downstream code doesn't need to change    
-    X, X_bts, y, y_bts = train_test_split(x_features, y_target
-                                                    , test_size = 0.2
-                                                    , **rs
-                                                    , stratify = y_target)
-    yc1 = Counter(y)
-    yc1_ratio = yc1[0]/yc1[1]
-    
-    yc2 = Counter(y_bts)
-    yc2_ratio = yc2[0]/yc2[1]
-    
-    ###############################################################################
-    #======================================================
-    # Determine categorical and numerical features
-    #======================================================
-    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
-    numerical_cols 
-    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
-    categorical_cols 
-    
-    ################################################################################
-    # IMPORTANT sanity checks
-    if len(X.columns) == len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN):
-        print('\nPASS: ML data with input features, training and test generated...'
-              , '\n\nTotal no. of input features:'        , len(X.columns)
-              , '\n--------No. of numerical features:'    , len(numerical_cols)
-              , '\n--------No. of categorical features:'  , len(categorical_cols)
-              
-              , '\n\nTotal no. of evolutionary features:' , len(X_evolFN)
-              
-              , '\n\nTotal no. of stability features:'    , len(X_stability_FN)
-              , '\n--------Common stabilty cols:'         , len(X_common_stability_Fnum)
-              , '\n--------Foldx cols:'                   , len(X_foldX_Fnum)
-              
-              , '\n\nTotal no. of affinity features:'     , len(X_affinityFN)
-              , '\n--------Common affinity cols:'         , len(common_affinity_Fnum)
-              , '\n--------Gene specific affinity cols:'  , len(gene_affinity_colnames)
-              
-              , '\n\nTotal no. of residue level features:', len(X_resprop_FN)
-              , '\n--------AA index cols:'                , len(X_aaindex_Fnum)
-              , '\n--------Residue Prop cols:'            , len(X_str_Fnum)
-              , '\n--------AA change Prop cols:'          , len(X_aap_Fcat)
-              
-              , '\n\nTotal no. of genomic features:'      , len(X_genomicFN)
-              , '\n--------MAF+OR cols:'                  , len(X_gn_mafor_Fnum)
-              , '\n--------Lineage cols:'                 , len(X_gn_linegae_Fnum)
-              , '\n--------Other cols:'                   , len(X_gn_Fcat)
-              )
-    else:
-        print('\nFAIL: numbers mismatch'
-              , '\nExpected:',len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN)
-              , '\nGot:', len(X.columns))
-        sys.exit()
-    ###############################################################################
-    print('\n-------------------------------------------------------------'
-          , '\nSuccessfully split data: ALL features'
-          , '\nactual values: training set'
-          ,  '\nSplit:', tts_split
-          #, '\nimputed values: blind test set'
-          
-          , '\n\nTotal data size:', len(X) + len(X_bts)
-    
-          , '\n\nTrain data size:', X.shape
-          , '\ny_train numbers:', yc1
-    
-          , '\n\nTest data size:', X_bts.shape
-          , '\ny_test_numbers:', yc2
-    
-          , '\n\ny_train ratio:',yc1_ratio
-          , '\ny_test ratio:', yc2_ratio
-          , '\n-------------------------------------------------------------'
-          )
-    ##########################################################################    
-    # Quick check
-    #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
-    for i in range(len(cols_to_mask)):
-        ind = i+1
-        print('\nindex:', i, '\nind:', ind)
-        print('\nMask count check:'
-              , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
-              )
-    
-    print('Original Data\n', Counter(y)
-          , 'Data dim:', X.shape)
-    ###########################################################################
-    #%% 
-    ###########################################################################
-    #                               RESAMPLING
-    ###########################################################################
-    #------------------------------
-    # Simple Random oversampling
-    # [Numerical + categorical]
-    #------------------------------
-    oversample = RandomOverSampler(sampling_strategy='minority')
-    X_ros, y_ros = oversample.fit_resample(X, y)
-    print('\nSimple Random OverSampling\n', Counter(y_ros))
-    print(X_ros.shape)
-    
-    #------------------------------
-    # Simple Random Undersampling
-    # [Numerical + categorical]
-    #------------------------------
-    undersample = RandomUnderSampler(sampling_strategy='majority')
-    X_rus, y_rus = undersample.fit_resample(X, y)
-    print('\nSimple Random UnderSampling\n', Counter(y_rus))
-    print(X_rus.shape)
-    
-    #------------------------------
-    # Simple combine ROS and RUS
-    # [Numerical + categorical]
-    #------------------------------
-    oversample = RandomOverSampler(sampling_strategy='minority')
-    X_ros, y_ros = oversample.fit_resample(X, y)
-    undersample = RandomUnderSampler(sampling_strategy='majority')
-    X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
-    print('\nSimple Combined Over and UnderSampling\n',  Counter(y_rouC))
-    print(X_rouC.shape)
-    
-    #------------------------------
-    # SMOTE_NC: oversampling 
-    # [numerical + categorical]
-    #https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
-    #------------------------------
-    # Determine categorical and numerical features
-    numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
-    numerical_ix
-    num_featuresL = list(numerical_ix)
-    numerical_colind = X.columns.get_indexer(list(numerical_ix) )
-    numerical_colind
-    
-    categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
-    categorical_ix    
-    categorical_colind = X.columns.get_indexer(list(categorical_ix))
-    categorical_colind
-    
-    k_sm = 5 # 5 is default
-    sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
-    X_smnc, y_smnc = sm_nc.fit_resample(X, y)
-    print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
-    print(X_smnc.shape)
-    globals().update(locals()) # TROLOLOLOLOLOLS
-    #print("i did a horrible hack :-)")
-    ###############################################################################
-    #%% SMOTE RESAMPLING for NUMERICAL ONLY*
-    # #------------------------------
-    # # SMOTE: Oversampling
-    # # [Numerical ONLY]
-    # #------------------------------
-    # k_sm = 1
-    # sm = SMOTE(sampling_strategy = 'auto', k_neighbors = k_sm, **rs)
-    # X_sm, y_sm = sm.fit_resample(X, y)
-    # print(X_sm.shape)
-    # print('\nSMOTE OverSampling\n', Counter(y_sm))
-    # y_sm_df = y_sm.to_frame()
-    # y_sm_df.value_counts().plot(kind = 'bar')
-    
-    # #------------------------------
-    # # SMOTE: Over + Undersampling COMBINED
-    # # [Numerical ONLY]
-    # #-----------------------------
-    # sm_enn = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all', **rs, **njobs ))
-    # X_enn, y_enn = sm_enn.fit_resample(X, y)
-    # print(X_enn.shape)
-    # print('\nSMOTE Over+Under Sampling combined\n', Counter(y_enn))
-    
-    ###############################################################################
-    # TODO: Find over and undersampling JUST for categorical data
-        ###########################################################################
-    
-    print('\n#################################################################'
-          , '\nDim of X for gene:', gene.lower(), '\n',  X.shape
-          , '\n###############################################################')
--- a/scripts/ml/ml_data_cd_sl.py
+++ b/scripts/ml/ml_data_cd_sl.py
@ -1,814 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Sun Mar  6 13:41:54 2022
-
-@author: tanu
-"""
-def setvars(gene,drug):
-    """Build the ML-ready data for one gene/drug pair and publish it as module globals.
-
-    Reads ``~/git/Data/<drug>/output/<gene>_merged_df3.csv`` and
-    ``~/git/Data/<drug>/output/aa_index/<gene>_aa.csv``, merges them on
-    'mutationinformation', derives the lineage and amino-acid property-change
-    columns, KNN-imputes the odds-ratio columns, zeroes the affinity columns
-    where ligand_distance > 10, assembles the feature-name lists, makes a
-    stratified train/blind-test split whose test fraction is
-    1/sqrt(number of features), and produces the resampled training sets
-    (random over/under sampling and SMOTENC).
-
-    Parameters
-    ----------
-    gene : str
-        Gene name; lower-cased to build file names and to pick the
-        gene-specific affinity columns. Only 'pnca', 'gid', 'rpob', 'alr',
-        'embb' and 'katg' are handled; for any other value cols_to_mask and
-        gene_affinity_colnames are never assigned.
-    drug : str
-        Drug name; used as a directory name under ``~/git/Data/`` and as a
-        column name in the merged data.
-
-    Returns
-    -------
-    None
-        Nothing is returned: every local variable (X, y, X_bts, y_bts, the
-        resampled sets, the feature lists, ...) is copied into this module's
-        globals by the ``globals().update(locals())`` call near the end.
-        Local names are therefore part of the interface and must not be
-        renamed or removed.
-
-    Notes
-    -----
-    Calls ``sys.exit`` when a sanity check fails, and writes
-    ``<outdir>ml/<gene>_mask_check.csv`` as a side effect.
-    """
-    #https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
-    import os, sys
-    import pandas as pd
-    import numpy as np
-    print(np.__version__)
-    print(pd.__version__)
-    import pprint as pp
-    from copy import deepcopy
-    from collections import Counter
-    from sklearn.impute import KNNImputer as KNN
-    from imblearn.over_sampling import RandomOverSampler
-    from imblearn.under_sampling import RandomUnderSampler
-    from imblearn.over_sampling import SMOTE
-    from sklearn.datasets import make_classification
-    from imblearn.combine import SMOTEENN
-    from imblearn.combine import SMOTETomek
-    
-    from imblearn.over_sampling import SMOTENC
-    from imblearn.under_sampling import EditedNearestNeighbours
-    from imblearn.under_sampling import RepeatedEditedNearestNeighbours
-    
-    from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
-    from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
-    
-    from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
-    from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
-    
-    from sklearn.pipeline import Pipeline, make_pipeline
-    import argparse
-    import re
-    #%% GLOBALS
-    tts_split = "sl"
-
-    rs = {'random_state': 42}
-    njobs = {'n_jobs': 10}
-    
-    scoring_fn =  ({ 'mcc'         : make_scorer(matthews_corrcoef)
-                    , 'accuracy'   : make_scorer(accuracy_score)
-                    , 'fscore'     : make_scorer(f1_score)
-                    , 'precision'  : make_scorer(precision_score)
-                    , 'recall'     : make_scorer(recall_score)
-                    , 'roc_auc'    : make_scorer(roc_auc_score)
-                    , 'jcc'        : make_scorer(jaccard_score)
-                }) 
-      
-    skf_cv = StratifiedKFold(n_splits = 10
-                              #, shuffle = False, random_state= None)
-                               , shuffle = True,**rs)
-    
-    rskf_cv = RepeatedStratifiedKFold(n_splits = 10
-                                      , n_repeats = 3
-                                      , **rs)
-    
-    mcc_score_fn  = {'mcc': make_scorer(matthews_corrcoef)}
-    jacc_score_fn = {'jcc': make_scorer(jaccard_score)}   
-    #%% FOR LATER: Combine ED logo data
-    ###########################################################################
-
-    homedir = os.path.expanduser("~")
-    
-    geneL_basic     = ['pnca']
-    geneL_na        = ['gid']
-    geneL_na_ppi2   = ['rpob']
-    geneL_ppi2      = ['alr', 'embb', 'katg']
-    
-    #num_type = ['int64', 'float64']
-    num_type = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
-    cat_type = ['object', 'bool']
-    
-    #==============
-    # directories
-    #==============
-    datadir = homedir + '/git/Data/'
-    indir   = datadir + drug + '/input/'
-    outdir  = datadir + drug + '/output/'
-    
-    #=======
-    # input
-    #=======
-    
-    #---------
-    # File 1
-    #---------
-    infile_ml1 = outdir + gene.lower() + '_merged_df3.csv' 
-    #infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
-    
-    my_features_df = pd.read_csv(infile_ml1, index_col = 0) 
-    my_features_df  = my_features_df .reset_index(drop = True)
-    my_features_df.index
-    
-    my_features_df.dtypes
-    mycols = my_features_df.columns
-    
-    #---------
-    # File 2
-    #---------
-    infile_aaindex = outdir + 'aa_index/' + gene.lower() + '_aa.csv' 
-    aaindex_df = pd.read_csv(infile_aaindex, index_col = 0) 
-    aaindex_df.dtypes
-    
-    #-----------
-    # check for non-numerical columns
-    #-----------
-    if any(aaindex_df.dtypes==object):
-        print('\naaindex_df contains non-numerical data')
-    
-    aaindex_df_object = aaindex_df.select_dtypes(include = cat_type)
-    print('\nTotal no. of non-numerial columns:', len(aaindex_df_object.columns))
-    
-    expected_aa_ncols = len(aaindex_df.columns) - len(aaindex_df_object.columns)
-    
-    #-----------
-    # Extract numerical data only
-    #-----------
-    print('\nSelecting numerical data only')
-    aaindex_df = aaindex_df.select_dtypes(include = num_type)
-    
-    #---------------------------
-    # aaindex: sanity check 1
-    #---------------------------
-    if len(aaindex_df.columns) == expected_aa_ncols:
-        print('\nPASS: successfully selected numerical columns only for aaindex_df')
-    else:
-        print('\nFAIL: Numbers mismatch'
-              , '\nExpected ncols:', expected_aa_ncols
-              , '\nGot:', len(aaindex_df.columns))    
-        
-    #---------------
-    # check for NA
-    #---------------
-    print('\nNow checking for NA in the remaining aaindex_cols')
-    c1 = aaindex_df.isna().sum()
-    c2 = c1.sort_values(ascending=False)
-    print('\nCounting aaindex_df cols with NA'
-          , '\nncols with NA:', sum(c2>0), 'columns'
-          , '\nDropping these...'
-          , '\nOriginal ncols:', len(aaindex_df.columns)
-          )
-    aa_df = aaindex_df.dropna(axis=1)
-    
-    print('\nRevised df ncols:', len(aa_df.columns))
-    
-    c3 = aa_df.isna().sum()
-    c4 = c3.sort_values(ascending=False)
-    
-    print('\nChecking NA in revised df...')
-    
-    if sum(c4>0):
-        sys.exit('\nFAIL: aaindex_df still contains cols with NA, please check and drop these before proceeding...')
-    else:
-        print('\nPASS: cols with NA successfully dropped from aaindex_df'
-              , '\nProceeding with combining aa_df with other features_df')
-        
-    #---------------------------
-    # aaindex: sanity check 2
-    #---------------------------
-    expected_aa_ncols2 =  len(aaindex_df.columns) - sum(c2>0)  
-    if len(aa_df.columns) == expected_aa_ncols2:
-        print('\nPASS: ncols match'
-              , '\nExpected ncols:', expected_aa_ncols2
-              , '\nGot:', len(aa_df.columns))
-    else:
-        print('\nFAIL: Numbers mismatch'
-              , '\nExpected ncols:', expected_aa_ncols2
-              , '\nGot:', len(aa_df.columns))            
-        
-    # Important: need this to identify aaindex cols    
-    aa_df_cols = aa_df.columns
-    print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
-    
-    ###############################################################################
-    #%% Combining my_features_df and aaindex_df
-    #===========================
-    # Merge my_df + aaindex_df
-    #===========================
-    
-    # NOTE(review): '==' between two pandas Index objects is elementwise, so this
-    # 'if' is only unambiguous when exactly one column name is shared; with
-    # several shared columns it raises. Left as-is; confirm the intended check.
-    if aa_df.columns[aa_df.columns.isin(my_features_df.columns)] == my_features_df.columns[my_features_df.columns.isin(aa_df.columns)]:
-        print('\nMerging on column: mutationinformation')   
-    
-    if len(my_features_df) == len(aa_df):
-        expected_nrows = len(my_features_df)
-        print('\nProceeding to merge, expected nrows in merged_df:', expected_nrows)
-    else:
-        # sys.exit() accepts a single argument, so build one message string
-        sys.exit('\nNrows mismatch, cannot merge. Please check'
-              + '\nnrows my_df: ' + str(len(my_features_df))
-              + '\nnrows aa_df: ' + str(len(aa_df)))
-               
-    #-----------------
-    # Reset index: mutationinformation
-    # Very important for merging
-    #-----------------
-    aa_df = aa_df.reset_index()
-    
-    expected_ncols = len(my_features_df.columns) + len(aa_df.columns) - 1 # for the no. of merging col
-    
-    #-----------------
-    # Merge: my_features_df + aa_df
-    #-----------------
-    merged_df = pd.merge(my_features_df
-                         , aa_df
-                         , on = 'mutationinformation')
-    
-    #---------------------------
-    # aaindex: sanity check 3
-    #---------------------------
-    if len(merged_df.columns) == expected_ncols:
-        print('\nPASS: my_features_df and aa_df successfully combined'
-              , '\nnrows:', len(merged_df)
-              , '\nncols:', len(merged_df.columns))
-    else:
-        # sys.exit() accepts a single argument, so concatenate the message
-        sys.exit('\nFAIL: could not combine my_features_df and aa_df'
-                 + '\nCheck dims and merging cols!')
-        
-    #--------
-    # Reassign so downstream code doesn't need to change
-    #--------
-    my_df = merged_df.copy()
-    
-    #%% Data: my_df
-    # Check if non structural pos have crept in
-    # IDEALLY remove from source! But for rpoB do it here
-    # Drop NA where numerical cols have them
-    if gene.lower() in geneL_na_ppi2:
-        #D1148 get rid of
-        na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
-        my_df = my_df.drop(index=na_index)
-    
-    # FIXED: complete data for all muts inc L114M, F115L, V123L, V125I, V131M
-    # if gene.lower() in ['embb']:
-    #     na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
-    #     my_df = my_df.drop(index=na_index)
-    
-    # # Sanity check for non-structural positions
-    # print('\nChecking for non-structural postions')
-    # na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
-    # if len(na_index) > 0:
-    #     print('\nNon-structural positions detected for gene:', gene.lower()
-    #           , '\nTotal number of these detected:', len(na_index)
-    #           , '\These are at index:', na_index
-    #           , '\nOriginal nrows:', len(my_df)
-    #           , '\nDropping these...')
-    #     my_df = my_df.drop(index=na_index)
-    #     print('\nRevised nrows:', len(my_df))
-    # else:
-    #     print('\nNo non-structural positions detected for gene:', gene.lower()
-    #           , '\nnrows:', len(my_df))
-              
-    
-    ###########################################################################
-    #%% Add lineage calculation columns
-    #FIXME: Check if this can be imported from config?
-    total_mtblineage_uc = 8
-    lineage_colnames = ['lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode']
-    #bar = my_df[lineage_colnames]
-    my_df['lineage_proportion']      = my_df['lineage_count_unique']/my_df['lineage_count_all']
-    my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineage_uc
-    ###########################################################################
-    #%% Active site annotation column
-    # change from numeric to categorical
-    
-    if my_df['active_site'].dtype in num_type:
-        my_df['active_site'] = my_df['active_site'].astype(object)
-        my_df['active_site'].dtype
-    #%% AA property change
-    #--------------------
-    # Water prop change
-    #--------------------
-    my_df['water_change'] = my_df['wt_prop_water'] + str('_to_') + my_df['mut_prop_water']
-    my_df['water_change'].value_counts()
-    
-    water_prop_changeD = {
-        'hydrophobic_to_neutral'          : 'change'
-        , 'hydrophobic_to_hydrophobic'    : 'no_change'
-        , 'neutral_to_neutral'            : 'no_change'
-        , 'neutral_to_hydrophobic'        : 'change'
-        , 'hydrophobic_to_hydrophilic'    : 'change'
-        , 'neutral_to_hydrophilic'        : 'change'
-        , 'hydrophilic_to_neutral'        : 'change'
-        , 'hydrophilic_to_hydrophobic'    : 'change'
-        , 'hydrophilic_to_hydrophilic'    : 'no_change'
-    }
-    
-    my_df['water_change'] = my_df['water_change'].map(water_prop_changeD)
-    my_df['water_change'].value_counts()
-    
-    #--------------------
-    # Polarity change
-    #--------------------
-    my_df['polarity_change'] = my_df['wt_prop_polarity'] + str('_to_') + my_df['mut_prop_polarity']
-    my_df['polarity_change'].value_counts()
-    
-    polarity_prop_changeD = {
-        'non-polar_to_non-polar'     : 'no_change'
-        , 'non-polar_to_neutral'     : 'change'  
-        , 'neutral_to_non-polar'     : 'change'  
-        , 'neutral_to_neutral'       : 'no_change'  
-        , 'non-polar_to_basic'       : 'change'  
-        , 'acidic_to_neutral'        : 'change'  
-        , 'basic_to_neutral'         : 'change'  
-        , 'non-polar_to_acidic'      : 'change'  
-        , 'neutral_to_basic'         : 'change'  
-        , 'acidic_to_non-polar'      : 'change'  
-        , 'basic_to_non-polar'       : 'change'
-        , 'neutral_to_acidic'        : 'change'
-        , 'acidic_to_acidic'         : 'no_change'
-        , 'basic_to_acidic'          : 'change'
-        , 'basic_to_basic'           : 'no_change'
-        , 'acidic_to_basic'          : 'change'}
-    
-    my_df['polarity_change'] = my_df['polarity_change'].map(polarity_prop_changeD)
-    my_df['polarity_change'].value_counts()
-    
-    #--------------------
-    # Electrostatics change
-    #--------------------
-    my_df['electrostatics_change'] = my_df['wt_calcprop'] + str('_to_') + my_df['mut_calcprop']
-    my_df['electrostatics_change'].value_counts()
-    
-    calc_prop_changeD = {
-            'non-polar_to_non-polar'     : 'no_change'
-            , 'non-polar_to_polar'       : 'change'
-            , 'polar_to_non-polar'       : 'change'
-            , 'non-polar_to_pos'         : 'change'
-            , 'neg_to_non-polar'         : 'change'
-            , 'non-polar_to_neg'         : 'change'
-            , 'pos_to_polar'             : 'change'
-            , 'pos_to_non-polar'         : 'change'
-            , 'polar_to_polar'           : 'no_change'
-            , 'neg_to_neg'               : 'no_change'
-            , 'polar_to_neg'             : 'change'
-            , 'pos_to_neg'               : 'change'
-            , 'pos_to_pos'               : 'no_change'
-            , 'polar_to_pos'             : 'change'
-            , 'neg_to_polar'             : 'change'
-            , 'neg_to_pos'               : 'change'
-    }
-    
-    my_df['electrostatics_change'] = my_df['electrostatics_change'].map(calc_prop_changeD)
-    my_df['electrostatics_change'].value_counts()
-    
-    #--------------------    
-    # Summary change: Create a combined column summarising these three cols
-    #--------------------
-    detect_change = 'change'
-    check_prop_cols = ['water_change', 'polarity_change', 'electrostatics_change']
-    #my_df['aa_prop_change'] = (my_df.values == detect_change).any(1).astype(int)
-    my_df['aa_prop_change'] = (my_df[check_prop_cols].values == detect_change).any(1).astype(int)
-    my_df['aa_prop_change'].value_counts()
-    my_df['aa_prop_change'].dtype
-    
-    my_df['aa_prop_change'] = my_df['aa_prop_change'].map({1:'change'
-                                                           , 0: 'no_change'})
-    
-    my_df['aa_prop_change'].value_counts()
-    my_df['aa_prop_change'].dtype
-    
-    #%% IMPUTE values for OR [check script for exploration: UQ_or_imputer]
-    #--------------------
-    # Impute OR values
-    #--------------------
-    #or_cols = ['or_mychisq', 'log10_or_mychisq', 'or_fisher']
-    sel_cols = ['mutationinformation', 'or_mychisq', 'log10_or_mychisq']
-    or_cols = ['or_mychisq', 'log10_or_mychisq']
-    
-    print("count of NULL values before imputation\n")
-    print(my_df[or_cols].isnull().sum())
-    
-    my_dfI = pd.DataFrame(index = my_df['mutationinformation'] )
-    
-        
-    my_dfI = pd.DataFrame(KNN(n_neighbors=3, weights="uniform").fit_transform(my_df[or_cols])
-                          , index =  my_df['mutationinformation']
-                          , columns = or_cols )
-    my_dfI.columns = ['or_rawI', 'logorI']
-    my_dfI.columns
-    my_dfI = my_dfI.reset_index(drop = False) # prevents old index from being added as a column
-    my_dfI.head()
-    print("count of NULL values AFTER imputation\n")
-    print(my_dfI.isnull().sum())
-    
-    #-------------------------------------------
-    # OR df Merge: with original based on index
-    #-------------------------------------------
-    #my_df['index_bm'] = my_df.index
-    mydf_imputed = pd.merge(my_df
-                        , my_dfI
-                        , on = 'mutationinformation')
-    #mydf_imputed = mydf_imputed.set_index(['index_bm'])
-    
-    my_df['log10_or_mychisq'].isna().sum()
-    mydf_imputed['log10_or_mychisq'].isna().sum()
-    mydf_imputed['logorI'].isna().sum() # should be 0
-    
-    len(my_df.columns)
-    len(mydf_imputed.columns)  
-    
-    #-----------------------------------------
-    # REASSIGN my_df after imputing OR values
-    #-----------------------------------------
-    my_df = mydf_imputed.copy()
-    
-    if my_df['logorI'].isna().sum() == 0:
-        print('\nPASS: OR values imputed, data ready for ML')
-    else:
-        sys.exit('\nFAIL: something went wrong, Data not ready for ML. Please check upstream!')
-    
-    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    #---------------------------------------
-    # TODO: try other imputation like MICE
-    #---------------------------------------
-    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    
-    #%%########################################################################
-    #==========================
-    #     Data for ML
-    #==========================
-    my_df_ml = my_df.copy()
-    
-    # Build column names to mask for affinity changes
-    if gene.lower() in geneL_basic:
-        #X_stabilityN = common_cols_stabiltyN
-        gene_affinity_colnames = []# not needed as its the common ones 
-        cols_to_mask = ['ligand_affinity_change']
-        
-    if gene.lower() in geneL_ppi2:
-        gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] 
-        #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
-    
-    if gene.lower() in geneL_na:
-        gene_affinity_colnames =  ['mcsm_na_affinity'] 
-        #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
-    
-    if gene.lower() in geneL_na_ppi2:
-        gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-        #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
-    
-    #=======================
-    # Masking columns:
-    # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
-    #=======================
-    my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
-    my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
-    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
-    
-    # mask the mcsm affinity related columns where ligand distance > 10
-    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
-    (my_df_ml['ligand_affinity_change'] == 0).sum()
-    
-    mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]  
-    
-    #===================================================
-    # write file for check
-    # reassign rather than sort in place: mask_check is a column subset of my_df_ml
-    mask_check = mask_check.sort_values(by = ['ligand_distance'], ascending = True)
-    mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
-    #===================================================
-    ###############################################################################
-    #%% Feature groups (FG): Build X for Input ML 
-    ############################################################################
-    #===========================
-    # FG1: Evolutionary features
-    #===========================
-    X_evolFN =  ['consurf_score'
-               , 'snap2_score'
-               , 'provean_score']
-    
-    ###############################################################################
-    #========================
-    # FG2: Stability features
-    #========================
-    #--------
-    # common
-    #--------
-    X_common_stability_Fnum = [
-               'duet_stability_change'
-               , 'ddg_foldx'
-               , 'deepddg'
-               , 'ddg_dynamut2'
-               , 'contacts']
-    #--------
-    # FoldX
-    #--------
-    X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
-    , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
-    , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
-    , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
-    , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
-    , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss']
-    
-    X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum
-    
-    ###############################################################################
-    #===================
-    # FG3: Affinity features
-    #===================
-    common_affinity_Fnum =  ['ligand_distance'
-                    , 'ligand_affinity_change'
-                    , 'mmcsm_lig']
-    
-    # if gene.lower() in geneL_basic:
-    #     X_affinityFN = common_affinity_Fnum 
-    # else:
-    #     X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
-        
-    X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
-    
-    ###############################################################################
-    #============================
-    # FG4: Residue level features
-    #============================
-    #-----------
-    # AA index
-    #-----------
-    X_aaindex_Fnum = list(aa_df_cols)
-    print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum))
-    
-    #-----------------
-    # surface area
-    # depth
-    # hydrophobicity
-    #-----------------
-    X_str_Fnum =  ['rsa'
-               #, 'asa'
-               , 'kd_values'
-               , 'rd_values']   
-    
-    #---------------------------
-    # Other aa properties
-    # active site indication
-    #---------------------------
-    X_aap_Fcat = ['ss_class'
-                # , 'wt_prop_water'
-                # , 'mut_prop_water'
-                # , 'wt_prop_polarity'
-                # , 'mut_prop_polarity'
-                # , 'wt_calcprop'
-                # , 'mut_calcprop'
-                , 'aa_prop_change'
-                , 'electrostatics_change'
-                , 'polarity_change'
-                , 'water_change'
-                , 'active_site']
-       
-    X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
-    ###############################################################################
-    #========================
-    # FG5: Genomic features
-    #========================
-    X_gn_mafor_Fnum =  ['maf'
-                    #, 'logorI'
-                    # , 'or_rawI'
-                    # , 'or_mychisq'
-                    # , 'or_logistic'
-                    # , 'or_fisher'
-                    # , 'pval_fisher'
-                    ]
-    
-    X_gn_linegae_Fnum  = ['lineage_proportion'
-                          , 'dist_lineage_proportion'
-                          #, 'lineage' # could be included as a category but it has L2;L4  formatting
-                          , 'lineage_count_all'
-                          , 'lineage_count_unique'
-                          ]
-    
-    # X_gn_Fcat = ['drtype_mode_labels'  # beware then you can't use it to predict [USED it for uq_v1, not v2]
-    #                #, 'gene_name' # will be required for the combined stuff
-    #              ]
-    X_gn_Fcat = []
-    
-    X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat
-    ###############################################################################
-    #========================
-    # FG6 collapsed: Structural : Stability + Affinity + ResidueProp
-    #========================
-    X_structural_FN =  X_stability_FN + X_affinityFN + X_resprop_FN
-    
-    ###############################################################################
-    #========================
-    # BUILDING all features
-    #========================
-    all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
-    
-    ###############################################################################
-    #%% Define training and test data
-    #================================================================
-    # Training and BLIND test set: scaling law split
-    #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d
-    # dst with actual values  : training set
-    # dst with imputed values : THROW AWAY [unrepresentative]
-    # test data size ~ 1/sqrt(features NOT including target variable)
-    #================================================================
-    my_df_ml[drug].isna().sum()
-    
-    #    blind_test_df = my_df_ml[my_df_ml[drug].isna()]
-    #    blind_test_df.shape
-    
-    #training_df = my_df_ml[my_df_ml[drug].notna()]
-    #training_df.shape
-
-    training_df = my_df_ml.copy()
-    
-    # Target 1: dst_mode
-    training_df[drug].value_counts()
-    training_df['dst_mode'].value_counts()
-    
-    ####################################################################
-    #====================================
-    # ML data: Train test split: SL
-    # with stratification
-    # 1-blind test : training_data for CV
-    # 1/sqrt(columns) : blind test 
-    #===========================================
-    x_features = training_df[all_featuresN]
-    y_target   = training_df['dst_mode']
-    
-    # sanity check
-    if not 'dst_mode' in x_features.columns:
-        print('\nPASS: x_features has no target variable')
-        x_ncols = len(x_features.columns)
-        print('\nNo. of columns for x_features:', x_ncols)
-        # NEED It for scaling law split
-        #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d
-    else:
-        sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')
-    #-------------------
-    # train-test split
-    #-------------------
-    sl_test_size = 1/np.sqrt(x_ncols)
-    train = 1 - sl_test_size
-
-    #x_train, x_test, y_train, y_test # traditional var_names
-    # so my downstream code doesn't need to change    
-    X, X_bts, y, y_bts = train_test_split(x_features, y_target
-                                                    , test_size = sl_test_size
-                                                    , **rs
-                                                    , stratify = y_target)
-    yc1 = Counter(y)
-    yc1_ratio = yc1[0]/yc1[1]
-    
-    yc2 = Counter(y_bts)
-    yc2_ratio = yc2[0]/yc2[1]
-    
-    ###############################################################################
-    #======================================================
-    # Determine categorical and numerical features
-    #======================================================
-    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
-    numerical_cols 
-    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
-    categorical_cols 
-    
-    ################################################################################
-    # IMPORTANT sanity checks
-    if len(X.columns) == len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN):
-        print('\nPASS: ML data with input features, training and test generated...'
-              , '\n\nTotal no. of input features:'        , len(X.columns)
-              , '\n--------No. of numerical features:'    , len(numerical_cols)
-              , '\n--------No. of categorical features:'  , len(categorical_cols)
-              
-              , '\n\nTotal no. of evolutionary features:' , len(X_evolFN)
-              
-              , '\n\nTotal no. of stability features:'    , len(X_stability_FN)
-              , '\n--------Common stabilty cols:'         , len(X_common_stability_Fnum)
-              , '\n--------Foldx cols:'                   , len(X_foldX_Fnum)
-              
-              , '\n\nTotal no. of affinity features:'     , len(X_affinityFN)
-              , '\n--------Common affinity cols:'         , len(common_affinity_Fnum)
-              , '\n--------Gene specific affinity cols:'  , len(gene_affinity_colnames)
-              
-              , '\n\nTotal no. of residue level features:', len(X_resprop_FN)
-              , '\n--------AA index cols:'                , len(X_aaindex_Fnum)
-              , '\n--------Residue Prop cols:'            , len(X_str_Fnum)
-              , '\n--------AA change Prop cols:'          , len(X_aap_Fcat)
-              
-              , '\n\nTotal no. of genomic features:'      , len(X_genomicFN)
-              , '\n--------MAF+OR cols:'                  , len(X_gn_mafor_Fnum)
-              , '\n--------Lineage cols:'                 , len(X_gn_linegae_Fnum)
-              , '\n--------Other cols:'                   , len(X_gn_Fcat)
-              )
-    else:
-        print('\nFAIL: numbers mismatch'
-              , '\nExpected:',len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN)
-              , '\nGot:', len(X.columns))
-        sys.exit()
-    ###############################################################################
-    print('\n-------------------------------------------------------------'
-          , '\nSuccessfully split data: ALL features'
-          , '\nactual values: training set'
-          ,  '\nSplit:', tts_split
-          #, '\nimputed values: blind test set'
-          
-          , '\n\nTotal data size:', len(X) + len(X_bts)
-    
-          , '\n\nTrain data size:', X.shape
-          , '\ny_train numbers:', yc1
-    
-          , '\n\nTest data size:', X_bts.shape
-          , '\ny_test_numbers:', yc2
-    
-          , '\n\ny_train ratio:',yc1_ratio
-          , '\ny_test ratio:', yc2_ratio
-          , '\n-------------------------------------------------------------'
-          )
-    ##########################################################################    
-    # Quick check
-    #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
-    for i in range(len(cols_to_mask)):
-        ind = i+1
-        print('\nindex:', i, '\nind:', ind)
-        print('\nMask count check:'
-              , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
-              )
-    
-    print('Original Data\n', Counter(y)
-          , 'Data dim:', X.shape)
-    ###########################################################################
-    #%% 
-    ###########################################################################
-    #                               RESAMPLING
-    ###########################################################################
-    #------------------------------
-    # Simple Random oversampling
-    # [Numerical + categorical]
-    #------------------------------
-    oversample = RandomOverSampler(sampling_strategy='minority')
-    X_ros, y_ros = oversample.fit_resample(X, y)
-    print('\nSimple Random OverSampling\n', Counter(y_ros))
-    print(X_ros.shape)
-    
-    #------------------------------
-    # Simple Random Undersampling
-    # [Numerical + categorical]
-    #------------------------------
-    undersample = RandomUnderSampler(sampling_strategy='majority')
-    X_rus, y_rus = undersample.fit_resample(X, y)
-    print('\nSimple Random UnderSampling\n', Counter(y_rus))
-    print(X_rus.shape)
-    
-    #------------------------------
-    # Simple combine ROS and RUS
-    # [Numerical + categorical]
-    #------------------------------
-    oversample = RandomOverSampler(sampling_strategy='minority')
-    X_ros, y_ros = oversample.fit_resample(X, y)
-    undersample = RandomUnderSampler(sampling_strategy='majority')
-    
-    X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
-    print('\nSimple Combined Over and UnderSampling\n',  Counter(y_rouC))
-    print(X_rouC.shape)
-    
-    #------------------------------
-    # SMOTE_NC: oversampling 
-    # [numerical + categorical]
-    #https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
-    #------------------------------
-    # Determine categorical and numerical features
-    numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
-    numerical_ix
-    num_featuresL = list(numerical_ix)
-    numerical_colind = X.columns.get_indexer(list(numerical_ix) )
-    numerical_colind
-    
-    categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
-    categorical_ix    
-    categorical_colind = X.columns.get_indexer(list(categorical_ix))
-    categorical_colind
-    
-    k_sm = 5 # 5 is default
-    sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
-    X_smnc, y_smnc = sm_nc.fit_resample(X, y)
-    print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
-    print(X_smnc.shape)
-    # HACK: copies every local defined above into this module's globals; this is
-    # how results are handed to callers, so local names must stay stable.
-    globals().update(locals()) # TROLOLOLOLOLOLS
-    #print("i did a horrible hack :-)")
-    ###############################################################################
-    #%% SMOTE RESAMPLING for NUMERICAL ONLY*
-    # #------------------------------
-    # # SMOTE: Oversampling
-    # # [Numerical ONLY]
-    # #------------------------------
-    # k_sm = 1
-    # sm = SMOTE(sampling_strategy = 'auto', k_neighbors = k_sm, **rs)
-    # X_sm, y_sm = sm.fit_resample(X, y)
-    # print(X_sm.shape)
-    # print('\nSMOTE OverSampling\n', Counter(y_sm))
-    # y_sm_df = y_sm.to_frame()
-    # y_sm_df.value_counts().plot(kind = 'bar')
-    
-    # #------------------------------
-    # # SMOTE: Over + Undersampling COMBINED
-    # # [Numerical ONLY]
-    # #-----------------------------
-    # sm_enn = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all', **rs, **njobs ))
-    # X_enn, y_enn = sm_enn.fit_resample(X, y)
-    # print(X_enn.shape)
-    # print('\nSMOTE Over+Under Sampling combined\n', Counter(y_enn))
-    
-    ###############################################################################
-    # TODO: Find over and undersampling JUST for categorical data
-        ###########################################################################
-    
-    print('\n#################################################################'
-          , '\nDim of X for gene:', gene.lower(), '\n',  X.shape
-          , '\n###############################################################')
--- a/scripts/ml/ml_data_fg.py
+++ b/scripts/ml/ml_data_fg.py
@ -1,787 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Sun Mar  6 13:41:54 2022
-
-@author: tanu
-"""
-def setvars(gene, drug):
-    """
-    Build the ML input data for one gene/drug pair and publish it as module globals.
-
-    Reads <home>/git/Data/<drug>/output/<gene>_merged_df3.csv and
-    <home>/git/Data/<drug>/output/aa_index/<gene>_aa.csv, merges them on
-    'mutationinformation', derives extra columns (lineage proportions,
-    amino-acid property changes, KNN-imputed odds ratios), sets the affinity
-    columns to 0 where ligand_distance > 10, splits the rows into a training
-    set (drug column not NA) and a blind test set (drug column NA), and builds
-    resampled versions of the training data.
-
-    Parameters
-    ----------
-    gene : str
-        Gene name, lower-cased before use. Must be one of:
-        pnca, gid, rpob, alr, embb, katg.
-    drug : str
-        Drug name. Used to build the data directory and as the column that
-        separates training rows from blind-test rows.
-
-    Returns
-    -------
-    None
-        Every local name (X, y, X_bts, y_bts, X_ros, X_smnc, scoring_fn, ...)
-        is copied into this module's globals by the closing
-        globals().update(locals()) call.
-
-    Side effects
-    ------------
-    Prints progress messages, writes <outdir>/ml/<gene>_mask_check.csv and
-    calls sys.exit() when a sanity check fails.
-    """
-    #https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
-    # NOTE: imports that look unused are kept on purpose: the closing
-    # globals().update(locals()) exposes every one of them outside this function.
-    import os, sys
-    import pandas as pd
-    import numpy as np
-    print(np.__version__)
-    print(pd.__version__)
-    import pprint as pp
-    from copy import deepcopy
-    from collections import Counter
-    from sklearn.impute import KNNImputer as KNN
-    from imblearn.over_sampling import RandomOverSampler
-    from imblearn.under_sampling import RandomUnderSampler
-    from imblearn.over_sampling import SMOTE
-    from sklearn.datasets import make_classification
-    from imblearn.combine import SMOTEENN
-    from imblearn.combine import SMOTETomek
-
-    from imblearn.over_sampling import SMOTENC
-    from imblearn.under_sampling import EditedNearestNeighbours
-    from imblearn.under_sampling import RepeatedEditedNearestNeighbours
-
-    from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
-    from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
-
-    from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
-    from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
-
-    from sklearn.pipeline import Pipeline, make_pipeline
-    import argparse
-    import re
-    #%% GLOBALS
-    rs = {'random_state': 42}
-    njobs = {'n_jobs': 10}
-
-    scoring_fn =  ({ 'mcc'         : make_scorer(matthews_corrcoef)
-                    , 'accuracy'   : make_scorer(accuracy_score)
-                    , 'fscore'     : make_scorer(f1_score)
-                    , 'precision'  : make_scorer(precision_score)
-                    , 'recall'     : make_scorer(recall_score)
-                    , 'roc_auc'    : make_scorer(roc_auc_score)
-                    , 'jcc'        : make_scorer(jaccard_score)
-                })
-
-    skf_cv = StratifiedKFold(n_splits = 10
-                               , shuffle = True,**rs)
-
-    rskf_cv = RepeatedStratifiedKFold(n_splits = 10
-                                      , n_repeats = 3
-                                      , **rs)
-
-    mcc_score_fn  = {'mcc': make_scorer(matthews_corrcoef)}
-    jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
-
-    #%% FOR LATER: Combine ED logo data
-    ###########################################################################
-    homedir = os.path.expanduser("~")
-
-    # Gene groups: they decide which gene-specific affinity columns exist
-    geneL_basic     = ['pnca']
-    geneL_na        = ['gid']
-    geneL_na_ppi2   = ['rpob']
-    geneL_ppi2      = ['alr', 'embb', 'katg']
-
-    num_type = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
-    cat_type = ['object', 'bool']
-
-    #==============
-    # directories
-    #==============
-    datadir = homedir + '/git/Data/'
-    indir   = datadir + drug + '/input/'
-    outdir  = datadir + drug + '/output/'
-
-    #=======
-    # input
-    #=======
-    #---------
-    # File 1: merged features
-    #---------
-    infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
-
-    my_features_df = pd.read_csv(infile_ml1, index_col = 0)
-    my_features_df = my_features_df.reset_index(drop = True)
-    mycols = my_features_df.columns
-
-    #---------
-    # File 2: aaindex features
-    #---------
-    infile_aaindex = outdir + 'aa_index/' + gene.lower() + '_aa.csv'
-    aaindex_df = pd.read_csv(infile_aaindex, index_col = 0)
-
-    #-----------
-    # check for non-numerical columns
-    #-----------
-    if any(aaindex_df.dtypes==object):
-        print('\naaindex_df contains non-numerical data')
-
-    aaindex_df_object = aaindex_df.select_dtypes(include = cat_type)
-    print('\nTotal no. of non-numerial columns:', len(aaindex_df_object.columns))
-
-    expected_aa_ncols = len(aaindex_df.columns) - len(aaindex_df_object.columns)
-
-    #-----------
-    # Extract numerical data only
-    #-----------
-    print('\nSelecting numerical data only')
-    aaindex_df = aaindex_df.select_dtypes(include = num_type)
-
-    #---------------------------
-    # aaindex: sanity check 1
-    #---------------------------
-    if len(aaindex_df.columns) == expected_aa_ncols:
-        print('\nPASS: successfully selected numerical columns only for aaindex_df')
-    else:
-        print('\nFAIL: Numbers mismatch'
-              , '\nExpected ncols:', expected_aa_ncols
-              , '\nGot:', len(aaindex_df.columns))
-
-    #---------------
-    # check for NA: drop every aaindex column that contains one
-    #---------------
-    print('\nNow checking for NA in the remaining aaindex_cols')
-    c1 = aaindex_df.isna().sum()
-    c2 = c1.sort_values(ascending=False)
-    print('\nCounting aaindex_df cols with NA'
-          , '\nncols with NA:', sum(c2>0), 'columns'
-          , '\nDropping these...'
-          , '\nOriginal ncols:', len(aaindex_df.columns)
-          )
-    aa_df = aaindex_df.dropna(axis=1)
-
-    print('\nRevised df ncols:', len(aa_df.columns))
-
-    c3 = aa_df.isna().sum()
-    c4 = c3.sort_values(ascending=False)
-
-    print('\nChecking NA in revised df...')
-
-    if sum(c4>0):
-        sys.exit('\nFAIL: aaindex_df still contains cols with NA, please check and drop these before proceeding...')
-    else:
-        print('\nPASS: cols with NA successfully dropped from aaindex_df'
-              , '\nProceeding with combining aa_df with other features_df')
-
-    #---------------------------
-    # aaindex: sanity check 2
-    #---------------------------
-    expected_aa_ncols2 =  len(aaindex_df.columns) - sum(c2>0)
-    if len(aa_df.columns) == expected_aa_ncols2:
-        print('\nPASS: ncols match'
-              , '\nExpected ncols:', expected_aa_ncols2
-              , '\nGot:', len(aa_df.columns))
-    else:
-        print('\nFAIL: Numbers mismatch'
-              , '\nExpected ncols:', expected_aa_ncols2
-              , '\nGot:', len(aa_df.columns))
-
-    # Important: need this to identify aaindex cols
-    aa_df_cols = aa_df.columns
-    print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
-
-    ###############################################################################
-    #%% Combining my_features_df and aaindex_df
-    #===========================
-    # Merge my_df + aaindex_df
-    #===========================
-    # The merge key is still the index of aa_df here (it becomes a column after
-    # reset_index below), so test for it explicitly. Comparing two Index
-    # objects inside `if` is an ambiguous array truth test.
-    merge_col = 'mutationinformation'
-    aa_has_key = aa_df.index.name == merge_col or merge_col in aa_df.columns
-    if aa_has_key and merge_col in my_features_df.columns:
-        print('\nMerging on column: mutationinformation')
-
-    if len(my_features_df) == len(aa_df):
-        expected_nrows = len(my_features_df)
-        print('\nProceeding to merge, expected nrows in merged_df:', expected_nrows)
-    else:
-        # sys.exit() accepts a single argument: build one message string
-        sys.exit('\nNrows mismatch, cannot merge. Please check'
-                 + '\nnrows my_df: ' + str(len(my_features_df))
-                 + '\nnrows aa_df: ' + str(len(aa_df)))
-
-    #-----------------
-    # Reset index: mutationinformation
-    # Very important for merging
-    #-----------------
-    aa_df = aa_df.reset_index()
-
-    expected_ncols = len(my_features_df.columns) + len(aa_df.columns) - 1 # for the no. of merging col
-
-    #-----------------
-    # Merge: my_features_df + aa_df
-    #-----------------
-    merged_df = pd.merge(my_features_df
-                         , aa_df
-                         , on = 'mutationinformation')
-
-    #---------------------------
-    # aaindex: sanity check 3
-    #---------------------------
-    if len(merged_df.columns) == expected_ncols:
-        print('\nPASS: my_features_df and aa_df successfully combined'
-              , '\nnrows:', len(merged_df)
-              , '\nncols:', len(merged_df.columns))
-    else:
-        sys.exit('\nFAIL: could not combine my_features_df and aa_df'
-                 + '\nCheck dims and merging cols!')
-
-    #--------
-    # Reassign so downstream code doesn't need to change
-    #--------
-    my_df = merged_df.copy()
-
-    #%% Data: my_df
-    # Check if non structural pos have crept in
-    # IDEALLY remove from source! But for rpoB do it here
-    # Drop rows where mcsm_na_affinity is NA
-    if gene.lower() in geneL_na_ppi2:
-        #D1148 get rid of
-        na_index = my_df.index[my_df['mcsm_na_affinity'].isna()]
-        my_df = my_df.drop(index=na_index)
-
-    # FIXED: complete data for all muts inc L114M, F115L, V123L, V125I, V131M
-    # (embb no longer needs rows dropped on ligand_distance)
-
-    ###########################################################################
-    #%% Add lineage calculation columns
-    #FIXME: Check if this can be imported from config?
-    total_mtblineage_uc = 8
-    lineage_colnames = ['lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode']
-    my_df['lineage_proportion']      = my_df['lineage_count_unique']/my_df['lineage_count_all']
-    my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineage_uc
-    ###########################################################################
-    #%% Active site annotation column
-    # change from numeric to categorical
-    if my_df['active_site'].dtype in num_type:
-        my_df['active_site'] = my_df['active_site'].astype(object)
-
-    #%% AA property change
-    def _prop_changeD(categories):
-        # Map every '<wt>_to_<mut>' pair of the given classes:
-        # identical classes -> 'no_change', anything else -> 'change'
-        return {wt + '_to_' + mut: ('no_change' if wt == mut else 'change')
-                for wt in categories for mut in categories}
-
-    water_prop_changeD    = _prop_changeD(['hydrophobic', 'neutral', 'hydrophilic'])
-    polarity_prop_changeD = _prop_changeD(['non-polar', 'neutral', 'basic', 'acidic'])
-    calc_prop_changeD     = _prop_changeD(['non-polar', 'polar', 'pos', 'neg'])
-
-    #--------------------
-    # Water prop change
-    #--------------------
-    my_df['water_change'] = (my_df['wt_prop_water'] + '_to_' + my_df['mut_prop_water']).map(water_prop_changeD)
-
-    #--------------------
-    # Polarity change
-    #--------------------
-    my_df['polarity_change'] = (my_df['wt_prop_polarity'] + '_to_' + my_df['mut_prop_polarity']).map(polarity_prop_changeD)
-
-    #--------------------
-    # Electrostatics change
-    #--------------------
-    my_df['electrostatics_change'] = (my_df['wt_calcprop'] + '_to_' + my_df['mut_calcprop']).map(calc_prop_changeD)
-
-    #--------------------
-    # Summary change: 'change' if any of the three cols above says 'change'
-    #--------------------
-    detect_change = 'change'
-    check_prop_cols = ['water_change', 'polarity_change', 'electrostatics_change']
-    my_df['aa_prop_change'] = (my_df[check_prop_cols].values == detect_change).any(axis=1).astype(int)
-    my_df['aa_prop_change'] = my_df['aa_prop_change'].map({1:'change'
-                                                           , 0: 'no_change'})
-
-    #%% IMPUTE values for OR [check script for exploration: UQ_or_imputer]
-    #--------------------
-    # Impute OR values
-    #--------------------
-    sel_cols = ['mutationinformation', 'or_mychisq', 'log10_or_mychisq']
-    or_cols = ['or_mychisq', 'log10_or_mychisq']
-
-    print("count of NULL values before imputation\n")
-    print(my_df[or_cols].isnull().sum())
-
-    my_dfI = pd.DataFrame(KNN(n_neighbors=3, weights="uniform").fit_transform(my_df[or_cols])
-                          , index =  my_df['mutationinformation']
-                          , columns = or_cols )
-    my_dfI.columns = ['or_rawI', 'logorI']
-    my_dfI = my_dfI.reset_index(drop = False) # keeps mutationinformation as a column: it is the merge key below
-    print("count of NULL values AFTER imputation\n")
-    print(my_dfI.isnull().sum())
-
-    #-------------------------------------------
-    # OR df Merge: with original on mutationinformation
-    #-------------------------------------------
-    mydf_imputed = pd.merge(my_df
-                        , my_dfI
-                        , on = 'mutationinformation')
-
-    #-----------------------------------------
-    # REASSIGN my_df after imputing OR values
-    #-----------------------------------------
-    my_df = mydf_imputed.copy()
-
-    if my_df['logorI'].isna().sum() == 0:
-        print('\nPASS: OR values imputed, data ready for ML')
-    else:
-        sys.exit('\nFAIL: something went wrong, Data not ready for ML. Please check upstream!')
-
-    #---------------------------------------
-    # TODO: try other imputation like MICE
-    #---------------------------------------
-
-    #%%########################################################################
-    #==========================
-    #     Data for ML
-    #==========================
-    my_df_ml = my_df.copy()
-
-    # Build column names to mask for affinity changes.
-    # The gene lists are disjoint, so exactly one branch applies.
-    if gene.lower() in geneL_basic:
-        gene_affinity_colnames = [] # not needed as its the common ones
-        cols_to_mask = ['ligand_affinity_change']
-    elif gene.lower() in geneL_ppi2:
-        gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist']
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
-    elif gene.lower() in geneL_na:
-        gene_affinity_colnames = ['mcsm_na_affinity']
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
-    elif gene.lower() in geneL_na_ppi2:
-        gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
-    else:
-        # Without this the two names above stay unbound and fail later with NameError
-        sys.exit('\nFAIL: unsupported gene: ' + gene.lower())
-
-    #=======================
-    # Masking columns:
-    # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
-    #=======================
-    # mask the mcsm affinity related columns where ligand distance > 10
-    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
-
-    #===================================================
-    # write file for check
-    # (sort into a new frame rather than in place on a column slice)
-    mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask].sort_values(
-        by = ['ligand_distance'], ascending = True)
-    mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
-    #===================================================
-    ###############################################################################
-    #%% Feature groups (FG): Build X for Input ML
-    ############################################################################
-    #===========================
-    # FG1: Evolutionary features
-    #===========================
-    X_evolFN =  ['consurf_score'
-               , 'snap2_score'
-               , 'provean_score']
-
-    ###############################################################################
-    #========================
-    # FG2: Stability features
-    #========================
-    #--------
-    # common
-    #--------
-    X_common_stability_Fnum = [
-               'duet_stability_change'
-               , 'ddg_foldx'
-               , 'deepddg'
-               , 'ddg_dynamut2'
-               , 'contacts']
-    #--------
-    # FoldX
-    #--------
-    X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
-    , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
-    , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
-    , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
-    , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
-    , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss']
-
-    X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum
-
-    ###############################################################################
-    #===================
-    # FG3: Affinity features
-    #===================
-    common_affinity_Fnum =  ['ligand_distance'
-                    , 'ligand_affinity_change'
-                    , 'mmcsm_lig']
-
-    X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
-
-    ###############################################################################
-    #============================
-    # FG4: Residue level features
-    #============================
-    #-----------
-    # AA index
-    #-----------
-    X_aaindex_Fnum = list(aa_df_cols)
-    print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum))
-
-    #-----------------
-    # surface area
-    # depth
-    # hydrophobicity
-    #-----------------
-    X_str_Fnum =  ['rsa'
-               #, 'asa'
-               , 'kd_values'
-               , 'rd_values']
-
-    #---------------------------
-    # Other aa properties
-    # active site indication
-    # (the raw wt_/mut_ property columns are not used, only the derived *_change ones)
-    #---------------------------
-    X_aap_Fcat = ['ss_class'
-                , 'aa_prop_change'
-                , 'electrostatics_change'
-                , 'polarity_change'
-                , 'water_change'
-                , 'active_site']
-
-    X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
-    ###############################################################################
-    #========================
-    # FG5: Genomic features
-    #========================
-    X_gn_mafor_Fnum =  ['maf'
-                    , 'logorI'
-                    # , 'or_rawI'
-                    # , 'or_mychisq'
-                    # , 'or_logistic'
-                    # , 'or_fisher'
-                    # , 'pval_fisher'
-                    ]
-
-    X_gn_linegae_Fnum  = ['lineage_proportion'
-                          , 'dist_lineage_proportion'
-                          #, 'lineage' # could be included as a category but it has L2;L4  formatting
-                          , 'lineage_count_all'
-                          , 'lineage_count_unique'
-                          ]
-
-    X_gn_Fcat = ['drtype_mode_labels'  # beware then you can't use it to predict [USED it for uq_v1, not v2]
-                   #, 'gene_name' # will be required for the combined stuff
-                 ]
-
-    X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat
-    ###############################################################################
-    #========================
-    # FG6 collapsed: Structural : Stability + Affinity + ResidueProp
-    #========================
-    X_structural_FN =  X_stability_FN + X_affinityFN + X_resprop_FN
-
-    ###############################################################################
-    #========================
-    # BUILDING all features
-    #========================
-    all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
-
-    ###############################################################################
-    #%% Define training and test data
-    #======================================================
-    # Training and BLIND test set: actual vs imputed
-    # dst with actual values  : training set
-    # dst with imputed values : blind test
-    #======================================================
-    # rows where the drug column is NA form the blind_test set
-    blind_test_df = my_df_ml[my_df_ml[drug].isna()]
-    training_df   = my_df_ml[my_df_ml[drug].notna()]
-
-    ####################################################################
-    #=====================================
-    # ML data: actual vs imputed
-    #=====================================
-    #------
-    # X: Training and Blind test (BTS)
-    #------
-    X     = training_df[all_featuresN]
-    X_bts = blind_test_df[all_featuresN]
-
-    #------
-    # y (Target: dst_mode)
-    #------
-    y     = training_df['dst_mode']
-    y_bts = blind_test_df['dst_mode']
-
-    # class 0 : class 1 ratios; nan instead of ZeroDivisionError when class 1 is absent
-    yc1 = Counter(y)
-    yc1_ratio = yc1[0]/yc1[1] if yc1[1] else float('nan')
-
-    yc2 = Counter(y_bts)
-    yc2_ratio = yc2[0]/yc2[1] if yc2[1] else float('nan')
-
-    ###############################################################################
-    #======================================================
-    # Determine categorical and numerical features
-    #======================================================
-    numerical_cols   = X.select_dtypes(include=num_type).columns
-    categorical_cols = X.select_dtypes(include=cat_type).columns
-
-    ################################################################################
-    # IMPORTANT sanity checks
-    expected_nfeatures = len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN)
-    if len(X.columns) == expected_nfeatures:
-        print('\nPASS: ML data with input features, training and test generated...'
-              , '\n\nTotal no. of input features:'        , len(X.columns)
-              , '\n--------No. of numerical features:'    , len(numerical_cols)
-              , '\n--------No. of categorical features:'  , len(categorical_cols)
-
-              , '\n\nTotal no. of evolutionary features:' , len(X_evolFN)
-
-              , '\n\nTotal no. of stability features:'    , len(X_stability_FN)
-              , '\n--------Common stabilty cols:'         , len(X_common_stability_Fnum)
-              , '\n--------Foldx cols:'                   , len(X_foldX_Fnum)
-
-              , '\n\nTotal no. of affinity features:'     , len(X_affinityFN)
-              , '\n--------Common affinity cols:'         , len(common_affinity_Fnum)
-              , '\n--------Gene specific affinity cols:'  , len(gene_affinity_colnames)
-
-              , '\n\nTotal no. of residue level features:', len(X_resprop_FN)
-              , '\n--------AA index cols:'                , len(X_aaindex_Fnum)
-              , '\n--------Residue Prop cols:'            , len(X_str_Fnum)
-              , '\n--------AA change Prop cols:'          , len(X_aap_Fcat)
-
-              , '\n\nTotal no. of genomic features:'      , len(X_genomicFN)
-              , '\n--------MAF+OR cols:'                  , len(X_gn_mafor_Fnum)
-              , '\n--------Lineage cols:'                 , len(X_gn_linegae_Fnum)
-              , '\n--------Other cols:'                   , len(X_gn_Fcat)
-              )
-    else:
-        print('\nFAIL: numbers mismatch'
-              , '\nExpected:', expected_nfeatures
-              , '\nGot:', len(X.columns))
-        sys.exit()
-    ###############################################################################
-    print('\n-------------------------------------------------------------'
-          , '\nSuccessfully split data: ALL features'
-          , '\nactual values: training set'
-          , '\nimputed values: blind test set'
-
-          , '\n\nTotal data size:', len(X) + len(X_bts)
-
-          , '\n\nTrain data size:', X.shape
-          , '\ny_train numbers:', yc1
-
-          , '\n\nTest data size:', X_bts.shape
-          , '\ny_test_numbers:', yc2
-
-          , '\n\ny_train ratio:',yc1_ratio
-          , '\ny_test ratio:', yc2_ratio
-          , '\n-------------------------------------------------------------'
-          )
-    ##########################################################################
-    # Quick check: per masked column, does the count of zeros equal the count
-    # of rows with ligand_distance > 10?
-    for i, mask_col in enumerate(cols_to_mask):
-        ind = i+1
-        print('\nindex:', i, '\nind:', ind)
-        print('\nMask count check:'
-              , (my_df_ml[mask_col]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
-              )
-
-    print('Original Data\n', Counter(y)
-          , 'Data dim:', X.shape)
-    ###########################################################################
-    #%%
-    ###########################################################################
-    #                               RESAMPLING
-    ###########################################################################
-    # The random samplers get the same seed (rs) as the CV splitters and
-    # SMOTENC so that repeated runs produce the same resampled sets.
-    #------------------------------
-    # Simple Random oversampling
-    # [Numerical + categorical]
-    #------------------------------
-    oversample = RandomOverSampler(sampling_strategy='minority', **rs)
-    X_ros, y_ros = oversample.fit_resample(X, y)
-    print('Simple Random OverSampling\n', Counter(y_ros))
-    print(X_ros.shape)
-
-    #------------------------------
-    # Simple Random Undersampling
-    # [Numerical + categorical]
-    #------------------------------
-    undersample = RandomUnderSampler(sampling_strategy='majority', **rs)
-    X_rus, y_rus = undersample.fit_resample(X, y)
-    print('Simple Random UnderSampling\n', Counter(y_rus))
-    print(X_rus.shape)
-
-    #------------------------------
-    # Simple combine ROS and RUS
-    # [Numerical + categorical]
-    # (undersample the already oversampled data from above)
-    #------------------------------
-    undersample = RandomUnderSampler(sampling_strategy='majority', **rs)
-    X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
-    print('Simple Combined Over and UnderSampling\n',  Counter(y_rouC))
-    print(X_rouC.shape)
-
-    #------------------------------
-    # SMOTE_NC: oversampling
-    # [numerical + categorical]
-    #https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
-    #------------------------------
-    # Determine categorical and numerical features
-    numerical_ix = X.select_dtypes(include=num_type).columns
-    num_featuresL = list(numerical_ix)
-    numerical_colind = X.columns.get_indexer(list(numerical_ix) )
-
-    # SMOTENC takes the positions of the categorical columns
-    categorical_ix = X.select_dtypes(include=cat_type).columns
-    categorical_colind = X.columns.get_indexer(list(categorical_ix))
-
-    k_sm = 5 # 5 is default
-    sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
-    X_smnc, y_smnc = sm_nc.fit_resample(X, y)
-    print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
-    print(X_smnc.shape)
-
-    # HACK: copy every local name into this module's globals; this is how the
-    # results of this function are handed to the rest of the code, so the
-    # function itself returns nothing.
-    globals().update(locals())
-    ###############################################################################
-    # Not done here: SMOTE / SMOTEENN resampling, which is for numerical data only
-    ###############################################################################
-    # TODO: Find over and undersampling JUST for categorical data
--- a/scripts/ml/ml_data_orig.py
+++ b/scripts/ml/ml_data_orig.py
@ -1,700 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Sun Mar  6 13:41:54 2022
-
-@author: tanu
-"""
-def setvars(gene,drug):
-    #https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
-    import os, sys
-    import pandas as pd
-    import numpy as np
-    print(np.__version__)
-    print(pd.__version__)
-    import pprint as pp
-    from copy import deepcopy
-    from collections import Counter
-    from sklearn.impute import KNNImputer as KNN
-    from imblearn.over_sampling import RandomOverSampler
-    from imblearn.under_sampling import RandomUnderSampler
-    from imblearn.over_sampling import SMOTE
-    from sklearn.datasets import make_classification
-    from imblearn.combine import SMOTEENN
-    from imblearn.combine import SMOTETomek
-    
-    from imblearn.over_sampling import SMOTENC
-    from imblearn.under_sampling import EditedNearestNeighbours
-    from imblearn.under_sampling import RepeatedEditedNearestNeighbours
-    
-    from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
-    from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
-    
-    from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
-    from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
-    
-    from sklearn.pipeline import Pipeline, make_pipeline
-    #%% GLOBALS
-    rs = {'random_state': 42}
-    njobs = {'n_jobs': 10}
-    
-    scoring_fn =  ({ 'mcc'         : make_scorer(matthews_corrcoef)
-                    , 'accuracy'   : make_scorer(accuracy_score)
-                    , 'fscore'     : make_scorer(f1_score)
-                    , 'precision'  : make_scorer(precision_score)
-                    , 'recall'     : make_scorer(recall_score)
-                    , 'roc_auc'    : make_scorer(roc_auc_score)
-                    , 'jcc'        : make_scorer(jaccard_score)
-                }) 
-      
-    skf_cv = StratifiedKFold(n_splits = 10
-                              #, shuffle = False, random_state= None)
-                               , shuffle = True,**rs)
-    
-    rskf_cv = RepeatedStratifiedKFold(n_splits = 10
-                                      , n_repeats = 3
-                                      , **rs)
-    
-    mcc_score_fn  = {'mcc': make_scorer(matthews_corrcoef)}
-    jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
-    
-    #%% FOR LATER: Combine ED logo data
-    #%% DONE: active aa site annotations **DONE on 15/05/2022 as part of generating merged_dfs
-    ###########################################################################
-    rs = {'random_state': 42}
-    njobs = {'n_jobs': 10}
-    homedir = os.path.expanduser("~")
-    
-    geneL_basic     = ['pnca']
-    geneL_na        = ['gid']
-    geneL_na_ppi2   = ['rpob']
-    geneL_ppi2      = ['alr', 'embb', 'katg']
-    
-    #num_type = ['int64', 'float64']
-    num_type = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
-    cat_type = ['object', 'bool']
-    
-    #==============
-    # directories
-    #==============
-    datadir = homedir + '/git/Data/'
-    indir   = datadir + drug + '/input/'
-    outdir  = datadir + drug + '/output/'
-    
-    #=======
-    # input
-    #=======
-    
-    #---------
-    # File 1
-    #---------
-    infile_ml1 = outdir + gene.lower() + '_merged_df3.csv' 
-    #infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
-    
-    my_features_df = pd.read_csv(infile_ml1, index_col = 0) 
-    my_features_df  = my_features_df .reset_index(drop = True)
-    my_features_df.index
-    
-    my_features_df.dtypes
-    mycols = my_features_df.columns
-    
-    #---------
-    # File 2
-    #---------
-    infile_aaindex = outdir + 'aa_index/' + gene.lower() + '_aa.csv' 
-    aaindex_df = pd.read_csv(infile_aaindex, index_col = 0) 
-    aaindex_df.dtypes
-    
-    #-----------
-    # check for non-numerical columns
-    #-----------
-    if any(aaindex_df.dtypes==object):
-        print('\naaindex_df contains non-numerical data')
-    
-    aaindex_df_object = aaindex_df.select_dtypes(include = cat_type)
-    print('\nTotal no. of non-numerial columns:', len(aaindex_df_object.columns))
-    
-    expected_aa_ncols = len(aaindex_df.columns) - len(aaindex_df_object.columns)
-
-    #-----------
-    # Extract numerical data only
-    #-----------
-    print('\nSelecting numerical data only')
-    aaindex_df = aaindex_df.select_dtypes(include = num_type)
-
-    #---------------------------
-    # aaindex: sanity check 1
-    #---------------------------
-    if len(aaindex_df.columns) == expected_aa_ncols:
-        print('\nPASS: successfully selected numerical columns only for aaindex_df')
-    else:
-        print('\nFAIL: Numbers mismatch'
-              , '\nExpected ncols:', expected_aa_ncols
-              , '\nGot:', len(aaindex_df.columns))    
-        
-    #---------------
-    # check for NA
-    #---------------
-    print('\nNow checking for NA in the remaining aaindex_cols')
-    c1 = aaindex_df.isna().sum()
-    c2 = c1.sort_values(ascending=False)
-    print('\nCounting aaindex_df cols with NA'
-          , '\nncols with NA:', sum(c2>0), 'columns'
-          , '\nDropping these...'
-          , '\nOriginal ncols:', len(aaindex_df.columns)
-          )
-    aa_df = aaindex_df.dropna(axis=1)
-    
-    print('\nRevised df ncols:', len(aa_df.columns))
-    
-    c3 = aa_df.isna().sum()
-    c4 = c3.sort_values(ascending=False)
-    
-    print('\nChecking NA in revised df...')
-    
-    if sum(c4>0):
-        sys.exit('\nFAIL: aaindex_df still contains cols with NA, please check and drop these before proceeding...')
-    else:
-        print('\nPASS: cols with NA successfully dropped from aaindex_df'
-              , '\nProceeding with combining aa_df with other features_df')
-        
-    #---------------------------
-    # aaindex: sanity check 2
-    #---------------------------
-    expected_aa_ncols2 =  len(aaindex_df.columns) - sum(c2>0)  
-    if len(aa_df.columns) == expected_aa_ncols2:
-        print('\nPASS: ncols match'
-              , '\nExpected ncols:', expected_aa_ncols2
-              , '\nGot:', len(aa_df.columns))
-    else:
-        print('\nFAIL: Numbers mismatch'
-              , '\nExpected ncols:', expected_aa_ncols2
-              , '\nGot:', len(aa_df.columns))            
-        
-    # Important: need this to identify aaindex cols    
-    aa_df_cols = aa_df.columns
-    print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
-    
-    ###############################################################################
-    #%% Combining my_features_df and aaindex_df
-    #===========================
-    # Merge my_df + aaindex_df
-    #===========================
-    
-    if aa_df.columns[aa_df.columns.isin(my_features_df.columns)] == my_features_df.columns[my_features_df.columns.isin(aa_df.columns)]:
-        print('\nMerging on column: mutationinformation')   
-    
-    if len(my_features_df) == len(aa_df):
-        expected_nrows = len(my_features_df)
-        print('\nProceeding to merge, expected nrows in merged_df:', expected_nrows)
-    else:
-        sys.exit('\nNrows mismatch, cannot merge. Please check'
-              , '\nnrows my_df:', len(my_features_df)
-              , '\nnrows aa_df:', len(aa_df))
-               
-    #-----------------
-    # Reset index: mutationinformation
-    # Very important for merging
-    #-----------------
-    aa_df = aa_df.reset_index()
-    
-    expected_ncols = len(my_features_df.columns) + len(aa_df.columns) - 1 # for the no. of merging col
-
-    #-----------------
-    # Merge: my_features_df + aa_df
-    #-----------------
-    merged_df = pd.merge(my_features_df
-                         , aa_df
-                         , on = 'mutationinformation')
-    
-    #---------------------------
-    # aaindex: sanity check 3
-    #---------------------------
-    if len(merged_df.columns) == expected_ncols:
-        print('\nPASS: my_features_df and aa_df successfully combined'
-              , '\nnrows:', len(merged_df)
-              , '\nncols:', len(merged_df.columns))
-    else:
-        sys.exit('\nFAIL: could not combine my_features_df and aa_df'
-                 , '\nCheck dims and merging cols!')
-        
-    #--------
-    # Reassign so downstream code doesn't need to change
-    #--------
-    my_df = merged_df.copy()
-    
-    #%% Data: my_df
-    # Check if non structural pos have crept in
-    # IDEALLY remove from source! But for rpoB do it here
-    # Drop NA where numerical cols have them
-    if gene.lower() in geneL_na_ppi2:
-        #D1148 get rid of
-        na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
-        my_df = my_df.drop(index=na_index)
-    
-    # FIXED: complete data for all muts inc L114M, F115L, V123L, V125I, V131M
-    # if gene.lower() in ['embb']:
-    #     na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
-    #     my_df = my_df.drop(index=na_index)
-    
-    # # Sanity check for non-structural positions
-    # print('\nChecking for non-structural postions')
-    # na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
-    # if len(na_index) > 0:
-    #     print('\nNon-structural positions detected for gene:', gene.lower()
-    #           , '\nTotal number of these detected:', len(na_index)
-    #           , '\These are at index:', na_index
-    #           , '\nOriginal nrows:', len(my_df)
-    #           , '\nDropping these...')
-    #     my_df = my_df.drop(index=na_index)
-    #     print('\nRevised nrows:', len(my_df))
-    # else:
-    #     print('\nNo non-structural positions detected for gene:', gene.lower()
-    #           , '\nnrows:', len(my_df))
-              
-    
-    ###########################################################################
-    #%% Add lineage calculation columns
-    #FIXME: Check if this can be imported from config?
-    total_mtblineage_uc = 8
-    lineage_colnames = ['lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode']
-    #bar = my_df[lineage_colnames]
-    my_df['lineage_proportion']      = my_df['lineage_count_unique']/my_df['lineage_count_all']
-    my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineage_uc
-    ###########################################################################
-    #%% Active site annotation column
-    # change from numberic to categorical
-
-    if my_df['active_site'].dtype in num_type:
-        my_df['active_site'] = my_df['active_site'].astype(object)
-        my_df['active_site'].dtype
-    #%% AA property change
-    #--------------------
-    # Water prop change
-    #--------------------
-    my_df['water_change'] = my_df['wt_prop_water'] + str('_to_') + my_df['mut_prop_water']
-    my_df['water_change'].value_counts()
-    
-    water_prop_changeD = {
-        'hydrophobic_to_neutral'          : 'change'
-        , 'hydrophobic_to_hydrophobic'    : 'no_change'
-        , 'neutral_to_neutral'            : 'no_change'
-        , 'neutral_to_hydrophobic'        : 'change'
-        , 'hydrophobic_to_hydrophilic'    : 'change'
-        , 'neutral_to_hydrophilic'        : 'change'
-        , 'hydrophilic_to_neutral'        : 'change'
-        , 'hydrophilic_to_hydrophobic'    : 'change'
-        , 'hydrophilic_to_hydrophilic'    : 'no_change'
-    }
-    
-    my_df['water_change'] = my_df['water_change'].map(water_prop_changeD)
-    my_df['water_change'].value_counts()
-    
-    #--------------------
-    # Polarity change
-    #--------------------
-    my_df['polarity_change'] = my_df['wt_prop_polarity'] + str('_to_') + my_df['mut_prop_polarity']
-    my_df['polarity_change'].value_counts()
-    
-    polarity_prop_changeD = {
-        'non-polar_to_non-polar'     : 'no_change'
-        , 'non-polar_to_neutral'     : 'change'  
-        , 'neutral_to_non-polar'     : 'change'  
-        , 'neutral_to_neutral'       : 'no_change'  
-        , 'non-polar_to_basic'       : 'change'  
-        , 'acidic_to_neutral'        : 'change'  
-        , 'basic_to_neutral'         : 'change'  
-        , 'non-polar_to_acidic'      : 'change'  
-        , 'neutral_to_basic'         : 'change'  
-        , 'acidic_to_non-polar'      : 'change'  
-        , 'basic_to_non-polar'       : 'change'
-        , 'neutral_to_acidic'        : 'change'
-        , 'acidic_to_acidic'         : 'no_change'
-        , 'basic_to_acidic'          : 'change'
-        , 'basic_to_basic'           : 'no_change'
-        , 'acidic_to_basic'          : 'change'}
-    
-    my_df['polarity_change'] = my_df['polarity_change'].map(polarity_prop_changeD)
-    my_df['polarity_change'].value_counts()
-    
-    #--------------------
-    # Electrostatics change
-    #--------------------
-    my_df['electrostatics_change'] = my_df['wt_calcprop'] + str('_to_') + my_df['mut_calcprop']
-    my_df['electrostatics_change'].value_counts()
-    
-    calc_prop_changeD = {
-            'non-polar_to_non-polar'     : 'no_change'
-            , 'non-polar_to_polar'       : 'change'
-            , 'polar_to_non-polar'       : 'change'
-            , 'non-polar_to_pos'         : 'change'
-            , 'neg_to_non-polar'         : 'change'
-            , 'non-polar_to_neg'         : 'change'
-            , 'pos_to_polar'             : 'change'
-            , 'pos_to_non-polar'         : 'change'
-            , 'polar_to_polar'           : 'no_change'
-            , 'neg_to_neg'               : 'no_change'
-            , 'polar_to_neg'             : 'change'
-            , 'pos_to_neg'               : 'change'
-            , 'pos_to_pos'               : 'no_change'
-            , 'polar_to_pos'             : 'change'
-            , 'neg_to_polar'             : 'change'
-            , 'neg_to_pos'               : 'change'
-    }
-    
-    my_df['electrostatics_change'] = my_df['electrostatics_change'].map(calc_prop_changeD)
-    my_df['electrostatics_change'].value_counts()
-    
-    #--------------------    
-    # Summary change: Create a combined column summarising these three cols
-    #--------------------
-    detect_change = 'change'
-    check_prop_cols = ['water_change', 'polarity_change', 'electrostatics_change']
-    #my_df['aa_prop_change'] = (my_df.values == detect_change).any(1).astype(int)
-    my_df['aa_prop_change'] = (my_df[check_prop_cols].values == detect_change).any(1).astype(int)
-    my_df['aa_prop_change'].value_counts()
-    my_df['aa_prop_change'].dtype
-    
-    my_df['aa_prop_change'] = my_df['aa_prop_change'].map({1:'change'
-                                                           , 0: 'no_change'})
-    
-    my_df['aa_prop_change'].value_counts()
-    my_df['aa_prop_change'].dtype
-    
-    #%% IMPUTE values for OR [check script for exploration: UQ_or_imputer]
-    #--------------------
-    # Impute OR values
-    #--------------------
-    #or_cols = ['or_mychisq', 'log10_or_mychisq', 'or_fisher']
-    sel_cols = ['mutationinformation', 'or_mychisq', 'log10_or_mychisq']
-    or_cols = ['or_mychisq', 'log10_or_mychisq']
-    
-    print("count of NULL values before imputation\n")
-    print(my_df[or_cols].isnull().sum())
-    
-    my_dfI = pd.DataFrame(index = my_df['mutationinformation'] )
-    
-        
-    my_dfI = pd.DataFrame(KNN(n_neighbors=3, weights="uniform").fit_transform(my_df[or_cols])
-                          , index =  my_df['mutationinformation']
-                          , columns = or_cols )
-    my_dfI.columns = ['or_rawI', 'logorI']
-    my_dfI.columns
-    my_dfI = my_dfI.reset_index(drop = False) # prevents old index from being added as a column
-    my_dfI.head()
-    print("count of NULL values AFTER imputation\n")
-    print(my_dfI.isnull().sum())
-    
-    #-------------------------------------------
-    # OR df Merge: with original based on index
-    #-------------------------------------------
-    #my_df['index_bm'] = my_df.index
-    mydf_imputed = pd.merge(my_df
-                        , my_dfI
-                        , on = 'mutationinformation')
-    #mydf_imputed = mydf_imputed.set_index(['index_bm'])
-    
-    my_df['log10_or_mychisq'].isna().sum()
-    mydf_imputed['log10_or_mychisq'].isna().sum()
-    mydf_imputed['logorI'].isna().sum() # should be 0
-    
-    len(my_df.columns)
-    len(mydf_imputed.columns)  
-    
-    #-----------------------------------------
-    # REASSIGN my_df after imputing OR values
-    #-----------------------------------------
-    my_df = mydf_imputed.copy()
-    
-    if my_df['logorI'].isna().sum() == 0:
-        print('\nPASS: OR values imputed, data ready for ML')
-    else:
-        sys.exit('\nFAIL: something went wrong, Data not ready for ML. Please check upstream!')
-    
-    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    #---------------------------------------
-    # TODO: try other imputation like MICE
-    #---------------------------------------
-    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    
-    #%%########################################################################
-    #==========================
-    #     Data for ML
-    #==========================
-    my_df_ml = my_df.copy()
-    
-    #%% Build X: input for ML
-    common_cols_stabiltyN = ['ligand_distance'
-               , 'ligand_affinity_change'
-               , 'duet_stability_change'
-               , 'ddg_foldx'
-               , 'deepddg'
-               , 'ddg_dynamut2'
-               , 'mmcsm_lig'
-               , 'contacts']
-    
-    # Build stability columns ~ gene
-    if gene.lower() in geneL_basic:
-        X_stabilityN = common_cols_stabiltyN
-        cols_to_mask = ['ligand_affinity_change']
-        
-    if gene.lower() in geneL_ppi2:
-    #    X_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity' , 'interface_dist'] 
-        geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist'] 
-        X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
-    
-    if gene.lower() in geneL_na:
-    #    X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] 
-        geneL_na_st_cols =  ['mcsm_na_affinity'] 
-        X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
-    
-    if gene.lower() in geneL_na_ppi2:
-    #    X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-        geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-        X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
-    
-    
-    X_foldX_cols = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
-    , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
-    , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
-    , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
-    , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
-    , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'
-    ]
-    
-    X_str =  ['rsa'
-               #, 'asa'
-               , 'kd_values'
-               , 'rd_values']    
-    
-    X_ssFN = X_stabilityN + X_str + X_foldX_cols
-    
-    X_evolFN =  ['consurf_score'
-               , 'snap2_score'
-               , 'provean_score']
-        
-    X_genomic_mafor =  ['maf'
-                    , 'logorI'
-                    # , 'or_rawI'
-                    # , 'or_mychisq'
-                    # , 'or_logistic'
-                    # , 'or_fisher'
-                    # , 'pval_fisher'
-                    ]
-    
-    X_genomic_linegae  = ['lineage_proportion'
-                          , 'dist_lineage_proportion'
-                          #, 'lineage' # could be included as a category but it has L2;L4  formatting
-                          , 'lineage_count_all'
-                          , 'lineage_count_unique'
-                          ]
-    
-    X_genomicFN = X_genomic_mafor + X_genomic_linegae
-    
-    X_aaindexFN = list(aa_df_cols)
-    
-    print('\nTotal no. of features for aaindex:', len(X_aaindexFN))
-    
-    # numerical feature names
-    numerical_FN = X_ssFN  + X_evolFN + X_genomicFN + X_aaindexFN
-
-    
-    # categorical feature names
-    categorical_FN = ['ss_class'
-                # , 'wt_prop_water'
-                # , 'mut_prop_water'
-                # , 'wt_prop_polarity'
-                # , 'mut_prop_polarity'
-                # , 'wt_calcprop'
-                # , 'mut_calcprop'
-                , 'aa_prop_change'
-                , 'electrostatics_change'
-                , 'polarity_change'
-                , 'water_change'
-                , 'drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2]
-                , 'active_site' #[didn't use it for uq_v1]
-                #, 'gene_name' # will be required for the combined stuff
-                 ]
-    #----------------------------------------------
-    # count numerical and categorical features
-    #----------------------------------------------
-    
-    print('\nNo. of numerical features:', len(numerical_FN)
-          , '\nNo. of categorical features:', len(categorical_FN))
-    
-    ###########################################################################
-    #=======================
-    # Masking columns:
-    # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
-    #=======================
-    # my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()
-    # my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
-    
-    # my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0
-    # (my_df_ml['ligand_affinity_change'] == 0).sum()
-    
-    my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
-    my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
-    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
-    
-    # mask the mcsm affinity related columns where ligand distance > 10
-    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
-    (my_df_ml['ligand_affinity_change'] == 0).sum()
-    
-    mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]  
-    
-    # write file for check
-    mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
-    mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
-    
-    #===================================================
-    # Training and BLIND test set: actual vs imputed
-    # ORIGINAL i.e.
-    # dst with actual values  : training set
-    # dst with imputed values : blind test
-    #==================================================
-    my_df_ml[drug].isna().sum()  #'na' ones are the blind_test set
-    
-    blind_test_df = my_df_ml[my_df_ml[drug].isna()]
-    blind_test_df.shape
-
-    training_df = my_df_ml[my_df_ml[drug].notna()]
-    training_df.shape
-    
-    # Target 1: dst_mode
-    training_df[drug].value_counts()
-    training_df['dst_mode'].value_counts()
-    ####################################################################
-
-    #============
-    # ML data
-    #============
-    #------
-    # X: Training and Blind test (BTS)
-    #------
-    X     = training_df[numerical_FN + categorical_FN] # training data ALL
-    X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL
-    
-    #------
-    # y
-    #------
-    y     = training_df['dst_mode'] # training data y
-    y_bts = blind_test_df['dst_mode'] # blind data test y
-        
-    # Quick check
-    #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
-    for i in range(len(cols_to_mask)):
-        ind = i+1
-        print('\nindex:', i, '\nind:', ind)
-        print('\nMask count check:'
-              , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
-              )
-    
-    print('Original Data\n', Counter(y)
-          , 'Data dim:', X.shape)
-    
-    yc1 = Counter(y)
-    yc1_ratio = yc1[0]/yc1[1]
-    
-    yc2 = Counter(y_bts)
-    yc2_ratio = yc2[0]/yc2[1]
-    
-    print('\n-------------------------------------------------------------'
-          , '\nSuccessfully split data: ORIGINAL training'
-          , '\nactual values: training set'
-          , '\nimputed values: blind test set'
-          , '\nTrain data size:', X.shape
-          , '\nTest data size:', X_bts.shape
-          , '\ny_train numbers:', yc1
-          , '\ny_train ratio:',yc1_ratio
-          , '\n'
-          , '\ny_test_numbers:', yc2
-          , '\ny_test ratio:', yc2_ratio
-          , '\n-------------------------------------------------------------'
-          )
-    ###########################################################################
-    #%% 
-    ###########################################################################
-    #                               RESAMPLING
-    ###########################################################################
-    #------------------------------
-    # Simple Random oversampling
-    # [Numerical + catgeorical]
-    #------------------------------
-    oversample = RandomOverSampler(sampling_strategy='minority')
-    X_ros, y_ros = oversample.fit_resample(X, y)
-    print('Simple Random OverSampling\n', Counter(y_ros))
-    print(X_ros.shape)
-    
-    #------------------------------
-    # Simple Random Undersampling
-    # [Numerical + catgeorical]
-    #------------------------------
-    undersample = RandomUnderSampler(sampling_strategy='majority')
-    X_rus, y_rus = undersample.fit_resample(X, y)
-    print('Simple Random UnderSampling\n', Counter(y_rus))
-    print(X_rus.shape)
-    
-    #------------------------------
-    # Simple combine ROS and RUS
-    # [Numerical + catgeorical]
-    #------------------------------
-    oversample = RandomOverSampler(sampling_strategy='minority')
-    X_ros, y_ros = oversample.fit_resample(X, y)
-    undersample = RandomUnderSampler(sampling_strategy='majority')
-    X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
-    print('Simple Combined Over and UnderSampling\n',  Counter(y_rouC))
-    print(X_rouC.shape)
-    
-    #------------------------------
-    # SMOTE_NC: oversampling 
-    # [numerical + categorical]
-    #https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
-    #------------------------------
-    # Determine categorical and numerical features
-    numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
-    numerical_ix
-    num_featuresL = list(numerical_ix)
-    numerical_colind = X.columns.get_indexer(list(numerical_ix) )
-    numerical_colind
-    
-    categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
-    categorical_ix    
-    categorical_colind = X.columns.get_indexer(list(categorical_ix))
-    categorical_colind
-    
-    k_sm = 5 # 5 is deafult
-    sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
-    X_smnc, y_smnc = sm_nc.fit_resample(X, y)
-    print('SMOTE_NC OverSampling\n', Counter(y_smnc))
-    print(X_smnc.shape)
-    globals().update(locals()) # TROLOLOLOLOLOLS
-    #print("i did a horrible hack :-)")
-    ###############################################################################
-    #%% SMOTE RESAMPLING for NUMERICAL ONLY*
-    # #------------------------------
-    # # SMOTE: Oversampling
-    # # [Numerical ONLY]
-    # #------------------------------
-    # k_sm = 1
-    # sm = SMOTE(sampling_strategy = 'auto', k_neighbors = k_sm, **rs)
-    # X_sm, y_sm = sm.fit_resample(X, y)
-    # print(X_sm.shape)
-    # print('SMOTE OverSampling\n', Counter(y_sm))
-    # y_sm_df = y_sm.to_frame()
-    # y_sm_df.value_counts().plot(kind = 'bar')
-    
-    # #------------------------------
-    # # SMOTE: Over + Undersampling COMBINED
-    # # [Numerical ONLY]
-    # #-----------------------------
-    # sm_enn = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all', **rs, **njobs ))
-    # X_enn, y_enn = sm_enn.fit_resample(X, y)
-    # print(X_enn.shape)
-    # print('SMOTE Over+Under Sampling combined\n', Counter(y_enn))
-    
-    ###############################################################################
-    # TODO: Find over and undersampling JUST for categorical data
--- a/scripts/ml/ml_data_rt.py
+++ b/scripts/ml/ml_data_rt.py
@ -1,735 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Sun Mar  6 13:41:54 2022
-
-@author: tanu
-"""
-def setvars(gene,drug):
-    #https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
-    import os, sys
-    import pandas as pd
-    import numpy as np
-    print(np.__version__)
-    print(pd.__version__)
-    import pprint as pp
-    from copy import deepcopy
-    from collections import Counter
-    from sklearn.impute import KNNImputer as KNN
-    from imblearn.over_sampling import RandomOverSampler
-    from imblearn.under_sampling import RandomUnderSampler
-    from imblearn.over_sampling import SMOTE
-    from sklearn.datasets import make_classification
-    from imblearn.combine import SMOTEENN
-    from imblearn.combine import SMOTETomek
-    
-    from imblearn.over_sampling import SMOTENC
-    from imblearn.under_sampling import EditedNearestNeighbours
-    from imblearn.under_sampling import RepeatedEditedNearestNeighbours
-    
-    from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
-    from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
-    
-    from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
-    from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
-    
-    from sklearn.pipeline import Pipeline, make_pipeline
-    #%% GLOBALS
-    rs = {'random_state': 42}
-    njobs = {'n_jobs': 10}
-    
-    scoring_fn =  ({ 'mcc'         : make_scorer(matthews_corrcoef)
-                    , 'accuracy'   : make_scorer(accuracy_score)
-                    , 'fscore'     : make_scorer(f1_score)
-                    , 'precision'  : make_scorer(precision_score)
-                    , 'recall'     : make_scorer(recall_score)
-                    , 'roc_auc'    : make_scorer(roc_auc_score)
-                    , 'jcc'        : make_scorer(jaccard_score)
-                }) 
-      
-    skf_cv = StratifiedKFold(n_splits = 10
-                              #, shuffle = False, random_state= None)
-                               , shuffle = True,**rs)
-    
-    rskf_cv = RepeatedStratifiedKFold(n_splits = 10
-                                      , n_repeats = 3
-                                      , **rs)
-    
-    mcc_score_fn  = {'mcc': make_scorer(matthews_corrcoef)}
-    jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
-    
-    #%% FOR LATER: Combine ED logo data
-    #%% DONE: active aa site annotations **DONE on 15/05/2022 as part of generating merged_dfs
-    ###########################################################################
-    rs = {'random_state': 42}
-    njobs = {'n_jobs': 10}
-    homedir = os.path.expanduser("~")
-    
-    geneL_basic     = ['pnca']
-    geneL_na        = ['gid']
-    geneL_na_ppi2   = ['rpob']
-    geneL_ppi2      = ['alr', 'embb', 'katg']
-    
-    #num_type = ['int64', 'float64']
-    num_type = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
-    cat_type = ['object', 'bool']
-    
-    #==============
-    # directories
-    #==============
-    datadir = homedir + '/git/Data/'
-    indir   = datadir + drug + '/input/'
-    outdir  = datadir + drug + '/output/'
-    
-    #=======
-    # input
-    #=======
-    
-    #---------
-    # File 1
-    #---------
-    infile_ml1 = outdir + gene.lower() + '_merged_df3.csv' 
-    #infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
-    
-    my_features_df = pd.read_csv(infile_ml1, index_col = 0) 
-    my_features_df  = my_features_df .reset_index(drop = True)
-    my_features_df.index
-    
-    my_features_df.dtypes
-    mycols = my_features_df.columns
-    
-    #---------
-    # File 2
-    #---------
-    infile_aaindex = outdir + 'aa_index/' + gene.lower() + '_aa.csv' 
-    aaindex_df = pd.read_csv(infile_aaindex, index_col = 0) 
-    aaindex_df.dtypes
-    
-    #-----------
-    # check for non-numerical columns
-    #-----------
-    if any(aaindex_df.dtypes==object):
-        print('\naaindex_df contains non-numerical data')
-    
-    aaindex_df_object = aaindex_df.select_dtypes(include = cat_type)
-    print('\nTotal no. of non-numerial columns:', len(aaindex_df_object.columns))
-    
-    expected_aa_ncols = len(aaindex_df.columns) - len(aaindex_df_object.columns)
-
-    #-----------
-    # Extract numerical data only
-    #-----------
-    print('\nSelecting numerical data only')
-    aaindex_df = aaindex_df.select_dtypes(include = num_type)
-
-    #---------------------------
-    # aaindex: sanity check 1
-    #---------------------------
-    if len(aaindex_df.columns) == expected_aa_ncols:
-        print('\nPASS: successfully selected numerical columns only for aaindex_df')
-    else:
-        print('\nFAIL: Numbers mismatch'
-              , '\nExpected ncols:', expected_aa_ncols
-              , '\nGot:', len(aaindex_df.columns))    
-        
-    #---------------
-    # check for NA
-    #---------------
-    print('\nNow checking for NA in the remaining aaindex_cols')
-    c1 = aaindex_df.isna().sum()
-    c2 = c1.sort_values(ascending=False)
-    print('\nCounting aaindex_df cols with NA'
-          , '\nncols with NA:', sum(c2>0), 'columns'
-          , '\nDropping these...'
-          , '\nOriginal ncols:', len(aaindex_df.columns)
-          )
-    aa_df = aaindex_df.dropna(axis=1)
-    
-    print('\nRevised df ncols:', len(aa_df.columns))
-    
-    c3 = aa_df.isna().sum()
-    c4 = c3.sort_values(ascending=False)
-    
-    print('\nChecking NA in revised df...')
-    
-    if sum(c4>0):
-        sys.exit('\nFAIL: aaindex_df still contains cols with NA, please check and drop these before proceeding...')
-    else:
-        print('\nPASS: cols with NA successfully dropped from aaindex_df'
-              , '\nProceeding with combining aa_df with other features_df')
-        
-    #---------------------------
-    # aaindex: sanity check 2
-    #---------------------------
-    expected_aa_ncols2 =  len(aaindex_df.columns) - sum(c2>0)  
-    if len(aa_df.columns) == expected_aa_ncols2:
-        print('\nPASS: ncols match'
-              , '\nExpected ncols:', expected_aa_ncols2
-              , '\nGot:', len(aa_df.columns))
-    else:
-        print('\nFAIL: Numbers mismatch'
-              , '\nExpected ncols:', expected_aa_ncols2
-              , '\nGot:', len(aa_df.columns))            
-        
-    # Important: need this to identify aaindex cols    
-    aa_df_cols = aa_df.columns
-    print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
-    
-    ###############################################################################
-    #%% Combining my_features_df and aaindex_df
-    #===========================
-    # Merge my_df + aaindex_df
-    #===========================
-    
-    if aa_df.columns[aa_df.columns.isin(my_features_df.columns)] == my_features_df.columns[my_features_df.columns.isin(aa_df.columns)]:
-        print('\nMerging on column: mutationinformation')   
-    
-    if len(my_features_df) == len(aa_df):
-        expected_nrows = len(my_features_df)
-        print('\nProceeding to merge, expected nrows in merged_df:', expected_nrows)
-    else:
-        sys.exit('\nNrows mismatch, cannot merge. Please check'
-              , '\nnrows my_df:', len(my_features_df)
-              , '\nnrows aa_df:', len(aa_df))
-               
-    #-----------------
-    # Reset index: mutationinformation
-    # Very important for merging
-    #-----------------
-    aa_df = aa_df.reset_index()
-    
-    expected_ncols = len(my_features_df.columns) + len(aa_df.columns) - 1 # for the no. of merging col
-
-    #-----------------
-    # Merge: my_features_df + aa_df
-    #-----------------
-    merged_df = pd.merge(my_features_df
-                         , aa_df
-                         , on = 'mutationinformation')
-    
-    #---------------------------
-    # aaindex: sanity check 3
-    #---------------------------
-    if len(merged_df.columns) == expected_ncols:
-        print('\nPASS: my_features_df and aa_df successfully combined'
-              , '\nnrows:', len(merged_df)
-              , '\nncols:', len(merged_df.columns))
-    else:
-        sys.exit('\nFAIL: could not combine my_features_df and aa_df'
-                 , '\nCheck dims and merging cols!')
-        
-    #--------
-    # Reassign so downstream code doesn't need to change
-    #--------
-    my_df = merged_df.copy()
-    
-    #%% Data: my_df
-    # Check if non structural pos have crept in
-    # IDEALLY remove from source! But for rpoB do it here
-    # Drop NA where numerical cols have them
-    if gene.lower() in geneL_na_ppi2:
-        #D1148 get rid of
-        na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
-        my_df = my_df.drop(index=na_index)
-    
-    # FIXED: complete data for all muts inc L114M, F115L, V123L, V125I, V131M
-    # if gene.lower() in ['embb']:
-    #     na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
-    #     my_df = my_df.drop(index=na_index)
-    
-    # # Sanity check for non-structural positions
-    # print('\nChecking for non-structural postions')
-    # na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
-    # if len(na_index) > 0:
-    #     print('\nNon-structural positions detected for gene:', gene.lower()
-    #           , '\nTotal number of these detected:', len(na_index)
-    #           , '\These are at index:', na_index
-    #           , '\nOriginal nrows:', len(my_df)
-    #           , '\nDropping these...')
-    #     my_df = my_df.drop(index=na_index)
-    #     print('\nRevised nrows:', len(my_df))
-    # else:
-    #     print('\nNo non-structural positions detected for gene:', gene.lower()
-    #           , '\nnrows:', len(my_df))
-              
-    
-    ###########################################################################
-    #%% Add lineage calculation columns
-    #FIXME: Check if this can be imported from config?
-    total_mtblineage_uc = 8
-    lineage_colnames = ['lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode']
-    #bar = my_df[lineage_colnames]
-    my_df['lineage_proportion']      = my_df['lineage_count_unique']/my_df['lineage_count_all']
-    my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineage_uc
-    ###########################################################################
-    #%% Active site annotation column
-    # change from numberic to categorical
-
-    if my_df['active_site'].dtype in num_type:
-        my_df['active_site'] = my_df['active_site'].astype(object)
-        my_df['active_site'].dtype
-    #%% AA property change
-    #--------------------
-    # Water prop change
-    #--------------------
-    my_df['water_change'] = my_df['wt_prop_water'] + str('_to_') + my_df['mut_prop_water']
-    my_df['water_change'].value_counts()
-    
-    water_prop_changeD = {
-        'hydrophobic_to_neutral'          : 'change'
-        , 'hydrophobic_to_hydrophobic'    : 'no_change'
-        , 'neutral_to_neutral'            : 'no_change'
-        , 'neutral_to_hydrophobic'        : 'change'
-        , 'hydrophobic_to_hydrophilic'    : 'change'
-        , 'neutral_to_hydrophilic'        : 'change'
-        , 'hydrophilic_to_neutral'        : 'change'
-        , 'hydrophilic_to_hydrophobic'    : 'change'
-        , 'hydrophilic_to_hydrophilic'    : 'no_change'
-    }
-    
-    my_df['water_change'] = my_df['water_change'].map(water_prop_changeD)
-    my_df['water_change'].value_counts()
-    
-    #--------------------
-    # Polarity change
-    #--------------------
-    my_df['polarity_change'] = my_df['wt_prop_polarity'] + str('_to_') + my_df['mut_prop_polarity']
-    my_df['polarity_change'].value_counts()
-    
-    polarity_prop_changeD = {
-        'non-polar_to_non-polar'     : 'no_change'
-        , 'non-polar_to_neutral'     : 'change'  
-        , 'neutral_to_non-polar'     : 'change'  
-        , 'neutral_to_neutral'       : 'no_change'  
-        , 'non-polar_to_basic'       : 'change'  
-        , 'acidic_to_neutral'        : 'change'  
-        , 'basic_to_neutral'         : 'change'  
-        , 'non-polar_to_acidic'      : 'change'  
-        , 'neutral_to_basic'         : 'change'  
-        , 'acidic_to_non-polar'      : 'change'  
-        , 'basic_to_non-polar'       : 'change'
-        , 'neutral_to_acidic'        : 'change'
-        , 'acidic_to_acidic'         : 'no_change'
-        , 'basic_to_acidic'          : 'change'
-        , 'basic_to_basic'           : 'no_change'
-        , 'acidic_to_basic'          : 'change'}
-    
-    my_df['polarity_change'] = my_df['polarity_change'].map(polarity_prop_changeD)
-    my_df['polarity_change'].value_counts()
-    
-    #--------------------
-    # Electrostatics change
-    #--------------------
-    my_df['electrostatics_change'] = my_df['wt_calcprop'] + str('_to_') + my_df['mut_calcprop']
-    my_df['electrostatics_change'].value_counts()
-    
-    calc_prop_changeD = {
-            'non-polar_to_non-polar'     : 'no_change'
-            , 'non-polar_to_polar'       : 'change'
-            , 'polar_to_non-polar'       : 'change'
-            , 'non-polar_to_pos'         : 'change'
-            , 'neg_to_non-polar'         : 'change'
-            , 'non-polar_to_neg'         : 'change'
-            , 'pos_to_polar'             : 'change'
-            , 'pos_to_non-polar'         : 'change'
-            , 'polar_to_polar'           : 'no_change'
-            , 'neg_to_neg'               : 'no_change'
-            , 'polar_to_neg'             : 'change'
-            , 'pos_to_neg'               : 'change'
-            , 'pos_to_pos'               : 'no_change'
-            , 'polar_to_pos'             : 'change'
-            , 'neg_to_polar'             : 'change'
-            , 'neg_to_pos'               : 'change'
-    }
-    
-    my_df['electrostatics_change'] = my_df['electrostatics_change'].map(calc_prop_changeD)
-    my_df['electrostatics_change'].value_counts()
-    
-    #--------------------    
-    # Summary change: Create a combined column summarising these three cols
-    #--------------------
-    detect_change = 'change'
-    check_prop_cols = ['water_change', 'polarity_change', 'electrostatics_change']
-    #my_df['aa_prop_change'] = (my_df.values == detect_change).any(1).astype(int)
-    my_df['aa_prop_change'] = (my_df[check_prop_cols].values == detect_change).any(1).astype(int)
-    my_df['aa_prop_change'].value_counts()
-    my_df['aa_prop_change'].dtype
-    
-    my_df['aa_prop_change'] = my_df['aa_prop_change'].map({1:'change'
-                                                           , 0: 'no_change'})
-    
-    my_df['aa_prop_change'].value_counts()
-    my_df['aa_prop_change'].dtype
-    
-    #%% IMPUTE values for OR [check script for exploration: UQ_or_imputer]
-    #--------------------
-    # Impute OR values
-    #--------------------
-    #or_cols = ['or_mychisq', 'log10_or_mychisq', 'or_fisher']
-    sel_cols = ['mutationinformation', 'or_mychisq', 'log10_or_mychisq']
-    or_cols = ['or_mychisq', 'log10_or_mychisq']
-    
-    print("count of NULL values before imputation\n")
-    print(my_df[or_cols].isnull().sum())
-    
-    my_dfI = pd.DataFrame(index = my_df['mutationinformation'] )
-    
-        
-    my_dfI = pd.DataFrame(KNN(n_neighbors=3, weights="uniform").fit_transform(my_df[or_cols])
-                          , index =  my_df['mutationinformation']
-                          , columns = or_cols )
-    my_dfI.columns = ['or_rawI', 'logorI']
-    my_dfI.columns
-    my_dfI = my_dfI.reset_index(drop = False) # prevents old index from being added as a column
-    my_dfI.head()
-    print("count of NULL values AFTER imputation\n")
-    print(my_dfI.isnull().sum())
-    
-    #-------------------------------------------
-    # OR df Merge: with original based on index
-    #-------------------------------------------
-    #my_df['index_bm'] = my_df.index
-    mydf_imputed = pd.merge(my_df
-                        , my_dfI
-                        , on = 'mutationinformation')
-    #mydf_imputed = mydf_imputed.set_index(['index_bm'])
-    
-    my_df['log10_or_mychisq'].isna().sum()
-    mydf_imputed['log10_or_mychisq'].isna().sum()
-    mydf_imputed['logorI'].isna().sum() # should be 0
-    
-    len(my_df.columns)
-    len(mydf_imputed.columns)  
-    
-    #-----------------------------------------
-    # REASSIGN my_df after imputing OR values
-    #-----------------------------------------
-    my_df = mydf_imputed.copy()
-    
-    if my_df['logorI'].isna().sum() == 0:
-        print('\nPASS: OR values imputed, data ready for ML')
-    else:
-        sys.exit('\nFAIL: something went wrong, Data not ready for ML. Please check upstream!')
-    
-    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    #---------------------------------------
-    # TODO: try other imputation like MICE
-    #---------------------------------------
-    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    
-    #%%########################################################################
-    #==========================
-    #     Data for ML
-    #==========================
-    my_df_ml = my_df.copy()
-    
-    #%% Build X: input for ML
-    common_cols_stabiltyN = ['ligand_distance'
-               , 'ligand_affinity_change'
-               , 'duet_stability_change'
-               , 'ddg_foldx'
-               , 'deepddg'
-               , 'ddg_dynamut2'
-               , 'mmcsm_lig'
-               , 'contacts']
-    
-    # Build stability columns ~ gene
-    if gene.lower() in geneL_basic:
-        X_stabilityN = common_cols_stabiltyN
-        cols_to_mask = ['ligand_affinity_change']
-        
-    if gene.lower() in geneL_ppi2:
-    #    X_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity' , 'interface_dist'] 
-        geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist'] 
-        X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
-    
-    if gene.lower() in geneL_na:
-    #    X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] 
-        geneL_na_st_cols =  ['mcsm_na_affinity'] 
-        X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
-    
-    if gene.lower() in geneL_na_ppi2:
-    #    X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-        geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-        X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
-    
-    
-    X_foldX_cols = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
-    , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
-    , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
-    , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
-    , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
-    , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'
-    ]
-    
-    X_str =  ['rsa'
-               #, 'asa'
-               , 'kd_values'
-               , 'rd_values']    
-    
-    X_ssFN = X_stabilityN + X_str + X_foldX_cols
-    
-    X_evolFN =  ['consurf_score'
-               , 'snap2_score'
-               , 'provean_score']
-        
-    X_genomic_mafor =  ['maf'
-                    , 'logorI'
-                    # , 'or_rawI'
-                    # , 'or_mychisq'
-                    # , 'or_logistic'
-                    # , 'or_fisher'
-                    # , 'pval_fisher'
-                    ]
-    
-    X_genomic_linegae  = ['lineage_proportion'
-                          , 'dist_lineage_proportion'
-                          #, 'lineage' # could be included as a category but it has L2;L4  formatting
-                          , 'lineage_count_all'
-                          , 'lineage_count_unique'
-                          ]
-    
-    X_genomicFN = X_genomic_mafor + X_genomic_linegae
-    
-    X_aaindexFN = list(aa_df_cols)
-    
-    print('\nTotal no. of features for aaindex:', len(X_aaindexFN))
-    
-    # numerical feature names
-    numerical_FN = X_ssFN  + X_evolFN + X_genomicFN + X_aaindexFN
-
-    
-    # categorical feature names
-    categorical_FN = ['ss_class'
-                # , 'wt_prop_water'
-                # , 'mut_prop_water'
-                # , 'wt_prop_polarity'
-                # , 'mut_prop_polarity'
-                # , 'wt_calcprop'
-                # , 'mut_calcprop'
-                , 'aa_prop_change'
-                , 'electrostatics_change'
-                , 'polarity_change'
-                , 'water_change'
-                , 'drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2]
-                , 'active_site' #[didn't use it for uq_v1]
-                #, 'gene_name' # will be required for the combined stuff
-                 ]
-    #----------------------------------------------
-    # count numerical and categorical features
-    #----------------------------------------------
-    
-    print('\nNo. of numerical features:', len(numerical_FN)
-          , '\nNo. of categorical features:', len(categorical_FN))
-    
-    ###########################################################################
-    #=======================
-    # Masking columns:
-    # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
-    #=======================
-    # my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()
-    # my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
-    
-    # my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0
-    # (my_df_ml['ligand_affinity_change'] == 0).sum()
-    
-    my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
-    my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
-    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
-    
-    # mask the mcsm affinity related columns where ligand distance > 10
-    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
-    (my_df_ml['ligand_affinity_change'] == 0).sum()
-    
-    mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]  
-    
-    # write file for check
-    mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
-    mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
-    
-    #=================================================
-    # Training and BLIND test set: imputed vs actual
-    # BUT in REVERSE i.e.
-    # dst with actual values  : blind test
-    # dst with imputed values : training set
-    #==================================================
-    my_df_ml[drug].isna().sum()  #'na' ones are now training set
-    
-    blind_test_df = my_df_ml[my_df_ml[drug].notna()]
-    blind_test_df.shape
-
-    training_df = my_df_ml[my_df_ml[drug].isna()]
-    training_df.shape
-    
-    # Target 1: dst_mode
-    training_df[drug].value_counts()
-    training_df['dst_mode'].value_counts()
-    ####################################################################
-    #%% extracting dfs based on numerical, categorical column names
-    #----------------------------------
-    # WITHOUT the target var included
-    #----------------------------------
-    num_df = training_df[numerical_FN]
-    num_df.shape
-    
-    cat_df = training_df[categorical_FN]
-    cat_df.shape
-    
-    all_df = training_df[numerical_FN + categorical_FN]
-    all_df.shape
-    
-    #------------------------------
-    # WITH the target var included:
-        #'wtgt': with target
-    #------------------------------
-    # drug and dst_mode should be the same thing
-    num_df_wtgt = training_df[numerical_FN + ['dst_mode']]
-    num_df_wtgt.shape
-    
-    cat_df_wtgt = training_df[categorical_FN + ['dst_mode']]
-    cat_df_wtgt.shape
-    
-    all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']]
-    all_df_wtgt.shape
-    #%%########################################################################
-    #============
-    # ML data
-    #============
-    #------
-    # X: Training and Blind test (BTS)
-    #------
-    X     = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL
-    X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL
-    #X = all_df_wtgt[numerical_FN] # training numerical only
-    #X_bts = blind_test_df[numerical_FN] # blind test data numerical
-    
-    #------
-    # y
-    #------
-    y = all_df_wtgt['dst_mode'] # training data y
-    y_bts = blind_test_df['dst_mode'] # blind data test y
-    
-    #X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']] 
-    
-    # Quick check
-    #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
-    for i in range(len(cols_to_mask)):
-        ind = i+1
-        print('\nindex:', i, '\nind:', ind)
-        print('\nMask count check:'
-              , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
-              )
-    
-    print('Original Data\n', Counter(y)
-          , 'Data dim:', X.shape)
-    
-    yc1 = Counter(y)
-    yc1_ratio = yc1[0]/yc1[1]
-    
-    yc2 = Counter(y_bts)
-    yc2_ratio = yc2[0]/yc2[1]
-    
-    print('\n-------------------------------------------------------------'
-          , '\nSuccessfully split data: REVERSE training'
-          , '\nimputed values: training set'
-          , '\nactual values: blind test set'
-          , '\nTrain data size:', X.shape
-          , '\nTest data size:', X_bts.shape
-          , '\ny_train numbers:', yc1
-          , '\ny_train ratio:',yc1_ratio
-          , '\n'
-          , '\ny_test_numbers:', yc2
-          , '\ny_test ratio:', yc2_ratio
-          , '\n-------------------------------------------------------------'
-          )
-    ###########################################################################
-    #%% 
-    ###########################################################################
-    #                               RESAMPLING
-    ###########################################################################
-    #------------------------------
-    # Simple Random oversampling
-    # [Numerical + catgeorical]
-    #------------------------------
-    oversample = RandomOverSampler(sampling_strategy='minority')
-    X_ros, y_ros = oversample.fit_resample(X, y)
-    print('Simple Random OverSampling\n', Counter(y_ros))
-    print(X_ros.shape)
-    
-    #------------------------------
-    # Simple Random Undersampling
-    # [Numerical + catgeorical]
-    #------------------------------
-    undersample = RandomUnderSampler(sampling_strategy='majority')
-    X_rus, y_rus = undersample.fit_resample(X, y)
-    print('Simple Random UnderSampling\n', Counter(y_rus))
-    print(X_rus.shape)
-    
-    #------------------------------
-    # Simple combine ROS and RUS
-    # [Numerical + catgeorical]
-    #------------------------------
-    oversample = RandomOverSampler(sampling_strategy='minority')
-    X_ros, y_ros = oversample.fit_resample(X, y)
-    undersample = RandomUnderSampler(sampling_strategy='majority')
-    X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
-    print('Simple Combined Over and UnderSampling\n',  Counter(y_rouC))
-    print(X_rouC.shape)
-    
-    #------------------------------
-    # SMOTE_NC: oversampling 
-    # [numerical + categorical]
-    #https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
-    #------------------------------
-    # Determine categorical and numerical features
-    numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
-    numerical_ix
-    num_featuresL = list(numerical_ix)
-    numerical_colind = X.columns.get_indexer(list(numerical_ix) )
-    numerical_colind
-    
-    categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
-    categorical_ix    
-    categorical_colind = X.columns.get_indexer(list(categorical_ix))
-    categorical_colind
-    
-    #k_sm = 5 #default, but this fails for gid as n_samples 3 [ONLY for reverse training]
-    if gene.lower() in geneL_na:
-        k_sm = 1
-    else:
-        k_sm = 5
-
-    sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
-    X_smnc, y_smnc = sm_nc.fit_resample(X, y)
-    print('SMOTE_NC OverSampling\n', Counter(y_smnc))
-    print(X_smnc.shape)
-    globals().update(locals()) # TROLOLOLOLOLOLS
-    #print("i did a horrible hack :-)")
-    ###############################################################################
-    #%% SMOTE RESAMPLING for NUMERICAL ONLY*
-    # #------------------------------
-    # # SMOTE: Oversampling
-    # # [Numerical ONLY]
-    # #------------------------------
-    # k_sm = 1
-    # sm = SMOTE(sampling_strategy = 'auto', k_neighbors = k_sm, **rs)
-    # X_sm, y_sm = sm.fit_resample(X, y)
-    # print(X_sm.shape)
-    # print('SMOTE OverSampling\n', Counter(y_sm))
-    # y_sm_df = y_sm.to_frame()
-    # y_sm_df.value_counts().plot(kind = 'bar')
-    
-    # #------------------------------
-    # # SMOTE: Over + Undersampling COMBINED
-    # # [Numerical ONLY]
-    # #-----------------------------
-    # sm_enn = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all', **rs, **njobs ))
-    # X_enn, y_enn = sm_enn.fit_resample(X, y)
-    # print(X_enn.shape)
-    # print('SMOTE Over+Under Sampling combined\n', Counter(y_enn))
-    
-    ###############################################################################
-    # TODO: Find over and undersampling JUST for categorical data
--- a/scripts/ml/ml_data_sl.py
+++ b/scripts/ml/ml_data_sl.py
@ -1,811 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Sun Mar  6 13:41:54 2022
-
-@author: tanu
-"""
-def setvars(gene,drug):
-    #https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
-    import os, sys
-    import pandas as pd
-    import numpy as np
-    print(np.__version__)
-    print(pd.__version__)
-    import pprint as pp
-    from copy import deepcopy
-    from collections import Counter
-    from sklearn.impute import KNNImputer as KNN
-    from imblearn.over_sampling import RandomOverSampler
-    from imblearn.under_sampling import RandomUnderSampler
-    from imblearn.over_sampling import SMOTE
-    from sklearn.datasets import make_classification
-    from imblearn.combine import SMOTEENN
-    from imblearn.combine import SMOTETomek
-    
-    from imblearn.over_sampling import SMOTENC
-    from imblearn.under_sampling import EditedNearestNeighbours
-    from imblearn.under_sampling import RepeatedEditedNearestNeighbours
-    
-    from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
-    from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
-    
-    from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
-    from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
-    
-    from sklearn.pipeline import Pipeline, make_pipeline
-    import argparse
-    import re
-    #%% GLOBALS
-    tts_split = "sl"
-
-    rs = {'random_state': 42}
-    njobs = {'n_jobs': 10}
-    
-    scoring_fn =  ({ 'mcc'         : make_scorer(matthews_corrcoef)
-                    , 'accuracy'   : make_scorer(accuracy_score)
-                    , 'fscore'     : make_scorer(f1_score)
-                    , 'precision'  : make_scorer(precision_score)
-                    , 'recall'     : make_scorer(recall_score)
-                    , 'roc_auc'    : make_scorer(roc_auc_score)
-                    , 'jcc'        : make_scorer(jaccard_score)
-                }) 
-      
-    skf_cv = StratifiedKFold(n_splits = 10
-                              #, shuffle = False, random_state= None)
-                               , shuffle = True,**rs)
-    
-    rskf_cv = RepeatedStratifiedKFold(n_splits = 10
-                                      , n_repeats = 3
-                                      , **rs)
-    
-    mcc_score_fn  = {'mcc': make_scorer(matthews_corrcoef)}
-    jacc_score_fn = {'jcc': make_scorer(jaccard_score)}   
-    #%% FOR LATER: Combine ED logo data
-    ###########################################################################
-
-    homedir = os.path.expanduser("~")
-    
-    geneL_basic     = ['pnca']
-    geneL_na        = ['gid']
-    geneL_na_ppi2   = ['rpob']
-    geneL_ppi2      = ['alr', 'embb', 'katg']
-    
-    #num_type = ['int64', 'float64']
-    num_type = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
-    cat_type = ['object', 'bool']
-    
-    #==============
-    # directories
-    #==============
-    datadir = homedir + '/git/Data/'
-    indir   = datadir + drug + '/input/'
-    outdir  = datadir + drug + '/output/'
-    
-    #=======
-    # input
-    #=======
-    
-    #---------
-    # File 1
-    #---------
-    infile_ml1 = outdir + gene.lower() + '_merged_df3.csv' 
-    #infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
-    
-    my_features_df = pd.read_csv(infile_ml1, index_col = 0) 
-    my_features_df  = my_features_df .reset_index(drop = True)
-    my_features_df.index
-    
-    my_features_df.dtypes
-    mycols = my_features_df.columns
-    
-    #---------
-    # File 2
-    #---------
-    infile_aaindex = outdir + 'aa_index/' + gene.lower() + '_aa.csv' 
-    aaindex_df = pd.read_csv(infile_aaindex, index_col = 0) 
-    aaindex_df.dtypes
-    
-    #-----------
-    # check for non-numerical columns
-    #-----------
-    if any(aaindex_df.dtypes==object):
-        print('\naaindex_df contains non-numerical data')
-    
-    aaindex_df_object = aaindex_df.select_dtypes(include = cat_type)
-    print('\nTotal no. of non-numerial columns:', len(aaindex_df_object.columns))
-    
-    expected_aa_ncols = len(aaindex_df.columns) - len(aaindex_df_object.columns)
-    
-    #-----------
-    # Extract numerical data only
-    #-----------
-    print('\nSelecting numerical data only')
-    aaindex_df = aaindex_df.select_dtypes(include = num_type)
-    
-    #---------------------------
-    # aaindex: sanity check 1
-    #---------------------------
-    if len(aaindex_df.columns) == expected_aa_ncols:
-        print('\nPASS: successfully selected numerical columns only for aaindex_df')
-    else:
-        print('\nFAIL: Numbers mismatch'
-              , '\nExpected ncols:', expected_aa_ncols
-              , '\nGot:', len(aaindex_df.columns))    
-        
-    #---------------
-    # check for NA
-    #---------------
-    print('\nNow checking for NA in the remaining aaindex_cols')
-    c1 = aaindex_df.isna().sum()
-    c2 = c1.sort_values(ascending=False)
-    print('\nCounting aaindex_df cols with NA'
-          , '\nncols with NA:', sum(c2>0), 'columns'
-          , '\nDropping these...'
-          , '\nOriginal ncols:', len(aaindex_df.columns)
-          )
-    aa_df = aaindex_df.dropna(axis=1)
-    
-    print('\nRevised df ncols:', len(aa_df.columns))
-    
-    c3 = aa_df.isna().sum()
-    c4 = c3.sort_values(ascending=False)
-    
-    print('\nChecking NA in revised df...')
-    
-    if sum(c4>0):
-        sys.exit('\nFAIL: aaindex_df still contains cols with NA, please check and drop these before proceeding...')
-    else:
-        print('\nPASS: cols with NA successfully dropped from aaindex_df'
-              , '\nProceeding with combining aa_df with other features_df')
-        
-    #---------------------------
-    # aaindex: sanity check 2
-    #---------------------------
-    expected_aa_ncols2 =  len(aaindex_df.columns) - sum(c2>0)  
-    if len(aa_df.columns) == expected_aa_ncols2:
-        print('\nPASS: ncols match'
-              , '\nExpected ncols:', expected_aa_ncols2
-              , '\nGot:', len(aa_df.columns))
-    else:
-        print('\nFAIL: Numbers mismatch'
-              , '\nExpected ncols:', expected_aa_ncols2
-              , '\nGot:', len(aa_df.columns))            
-        
-    # Important: need this to identify aaindex cols    
-    aa_df_cols = aa_df.columns
-    print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
-    
-    ###############################################################################
-    #%% Combining my_features_df and aaindex_df
-    #===========================
-    # Merge my_df + aaindex_df
-    #===========================
-    
-    if aa_df.columns[aa_df.columns.isin(my_features_df.columns)] == my_features_df.columns[my_features_df.columns.isin(aa_df.columns)]:
-        print('\nMerging on column: mutationinformation')   
-    
-    if len(my_features_df) == len(aa_df):
-        expected_nrows = len(my_features_df)
-        print('\nProceeding to merge, expected nrows in merged_df:', expected_nrows)
-    else:
-        sys.exit('\nNrows mismatch, cannot merge. Please check'
-              , '\nnrows my_df:', len(my_features_df)
-              , '\nnrows aa_df:', len(aa_df))
-               
-    #-----------------
-    # Reset index: mutationinformation
-    # Very important for merging
-    #-----------------
-    aa_df = aa_df.reset_index()
-    
-    expected_ncols = len(my_features_df.columns) + len(aa_df.columns) - 1 # for the no. of merging col
-    
-    #-----------------
-    # Merge: my_features_df + aa_df
-    #-----------------
-    merged_df = pd.merge(my_features_df
-                         , aa_df
-                         , on = 'mutationinformation')
-    
-    #---------------------------
-    # aaindex: sanity check 3
-    #---------------------------
-    if len(merged_df.columns) == expected_ncols:
-        print('\nPASS: my_features_df and aa_df successfully combined'
-              , '\nnrows:', len(merged_df)
-              , '\nncols:', len(merged_df.columns))
-    else:
-        sys.exit('\nFAIL: could not combine my_features_df and aa_df'
-                 , '\nCheck dims and merging cols!')
-        
-    #--------
-    # Reassign so downstream code doesn't need to change
-    #--------
-    my_df = merged_df.copy()
-    
-    #%% Data: my_df
-    # Check if non structural pos have crept in
-    # IDEALLY remove from source! But for rpoB do it here
-    # Drop NA where numerical cols have them
-    if gene.lower() in geneL_na_ppi2:
-        #D1148 get rid of
-        na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
-        my_df = my_df.drop(index=na_index)
-    
-    # FIXED: complete data for all muts inc L114M, F115L, V123L, V125I, V131M
-    # if gene.lower() in ['embb']:
-    #     na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
-    #     my_df = my_df.drop(index=na_index)
-    
-    # # Sanity check for non-structural positions
-    # print('\nChecking for non-structural postions')
-    # na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
-    # if len(na_index) > 0:
-    #     print('\nNon-structural positions detected for gene:', gene.lower()
-    #           , '\nTotal number of these detected:', len(na_index)
-    #           , '\These are at index:', na_index
-    #           , '\nOriginal nrows:', len(my_df)
-    #           , '\nDropping these...')
-    #     my_df = my_df.drop(index=na_index)
-    #     print('\nRevised nrows:', len(my_df))
-    # else:
-    #     print('\nNo non-structural positions detected for gene:', gene.lower()
-    #           , '\nnrows:', len(my_df))
-              
-    
-    ###########################################################################
-    #%% Add lineage calculation columns
-    #FIXME: Check if this can be imported from config?
-    total_mtblineage_uc = 8
-    lineage_colnames = ['lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode']
-    #bar = my_df[lineage_colnames]
-    my_df['lineage_proportion']      = my_df['lineage_count_unique']/my_df['lineage_count_all']
-    my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineage_uc
-    ###########################################################################
-    #%% Active site annotation column
-    # change from numeric to categorical
-    
-    if my_df['active_site'].dtype in num_type:
-        my_df['active_site'] = my_df['active_site'].astype(object)
-        my_df['active_site'].dtype
-    #%% AA property change
-    #--------------------
-    # Water prop change
-    #--------------------
-    my_df['water_change'] = my_df['wt_prop_water'] + str('_to_') + my_df['mut_prop_water']
-    my_df['water_change'].value_counts()
-    
-    water_prop_changeD = {
-        'hydrophobic_to_neutral'          : 'change'
-        , 'hydrophobic_to_hydrophobic'    : 'no_change'
-        , 'neutral_to_neutral'            : 'no_change'
-        , 'neutral_to_hydrophobic'        : 'change'
-        , 'hydrophobic_to_hydrophilic'    : 'change'
-        , 'neutral_to_hydrophilic'        : 'change'
-        , 'hydrophilic_to_neutral'        : 'change'
-        , 'hydrophilic_to_hydrophobic'    : 'change'
-        , 'hydrophilic_to_hydrophilic'    : 'no_change'
-    }
-    
-    my_df['water_change'] = my_df['water_change'].map(water_prop_changeD)
-    my_df['water_change'].value_counts()
-    
-    #--------------------
-    # Polarity change
-    #--------------------
-    my_df['polarity_change'] = my_df['wt_prop_polarity'] + str('_to_') + my_df['mut_prop_polarity']
-    my_df['polarity_change'].value_counts()
-    
-    polarity_prop_changeD = {
-        'non-polar_to_non-polar'     : 'no_change'
-        , 'non-polar_to_neutral'     : 'change'  
-        , 'neutral_to_non-polar'     : 'change'  
-        , 'neutral_to_neutral'       : 'no_change'  
-        , 'non-polar_to_basic'       : 'change'  
-        , 'acidic_to_neutral'        : 'change'  
-        , 'basic_to_neutral'         : 'change'  
-        , 'non-polar_to_acidic'      : 'change'  
-        , 'neutral_to_basic'         : 'change'  
-        , 'acidic_to_non-polar'      : 'change'  
-        , 'basic_to_non-polar'       : 'change'
-        , 'neutral_to_acidic'        : 'change'
-        , 'acidic_to_acidic'         : 'no_change'
-        , 'basic_to_acidic'          : 'change'
-        , 'basic_to_basic'           : 'no_change'
-        , 'acidic_to_basic'          : 'change'}
-    
-    my_df['polarity_change'] = my_df['polarity_change'].map(polarity_prop_changeD)
-    my_df['polarity_change'].value_counts()
-    
-    #--------------------
-    # Electrostatics change
-    #--------------------
-    my_df['electrostatics_change'] = my_df['wt_calcprop'] + str('_to_') + my_df['mut_calcprop']
-    my_df['electrostatics_change'].value_counts()
-    
-    calc_prop_changeD = {
-            'non-polar_to_non-polar'     : 'no_change'
-            , 'non-polar_to_polar'       : 'change'
-            , 'polar_to_non-polar'       : 'change'
-            , 'non-polar_to_pos'         : 'change'
-            , 'neg_to_non-polar'         : 'change'
-            , 'non-polar_to_neg'         : 'change'
-            , 'pos_to_polar'             : 'change'
-            , 'pos_to_non-polar'         : 'change'
-            , 'polar_to_polar'           : 'no_change'
-            , 'neg_to_neg'               : 'no_change'
-            , 'polar_to_neg'             : 'change'
-            , 'pos_to_neg'               : 'change'
-            , 'pos_to_pos'               : 'no_change'
-            , 'polar_to_pos'             : 'change'
-            , 'neg_to_polar'             : 'change'
-            , 'neg_to_pos'               : 'change'
-    }
-    
-    my_df['electrostatics_change'] = my_df['electrostatics_change'].map(calc_prop_changeD)
-    my_df['electrostatics_change'].value_counts()
-    
-    #--------------------    
-    # Summary change: Create a combined column summarising these three cols
-    #--------------------
-    detect_change = 'change'
-    check_prop_cols = ['water_change', 'polarity_change', 'electrostatics_change']
-    #my_df['aa_prop_change'] = (my_df.values == detect_change).any(1).astype(int)
-    my_df['aa_prop_change'] = (my_df[check_prop_cols].values == detect_change).any(1).astype(int)
-    my_df['aa_prop_change'].value_counts()
-    my_df['aa_prop_change'].dtype
-    
-    my_df['aa_prop_change'] = my_df['aa_prop_change'].map({1:'change'
-                                                           , 0: 'no_change'})
-    
-    my_df['aa_prop_change'].value_counts()
-    my_df['aa_prop_change'].dtype
-    
-    #%% IMPUTE values for OR [check script for exploration: UQ_or_imputer]
-    #--------------------
-    # Impute OR values
-    #--------------------
-    #or_cols = ['or_mychisq', 'log10_or_mychisq', 'or_fisher']
-    sel_cols = ['mutationinformation', 'or_mychisq', 'log10_or_mychisq']
-    or_cols = ['or_mychisq', 'log10_or_mychisq']
-    
-    print("count of NULL values before imputation\n")
-    print(my_df[or_cols].isnull().sum())
-    
-    my_dfI = pd.DataFrame(index = my_df['mutationinformation'] )
-    
-        
-    my_dfI = pd.DataFrame(KNN(n_neighbors=3, weights="uniform").fit_transform(my_df[or_cols])
-                          , index =  my_df['mutationinformation']
-                          , columns = or_cols )
-    my_dfI.columns = ['or_rawI', 'logorI']
-    my_dfI.columns
-    my_dfI = my_dfI.reset_index(drop = False) # drop = False keeps the old index (mutationinformation) as a column; it is the merge key below
-    my_dfI.head()
-    print("count of NULL values AFTER imputation\n")
-    print(my_dfI.isnull().sum())
-    
-    #-------------------------------------------
-    # OR df Merge: with original based on the 'mutationinformation' column
-    #-------------------------------------------
-    #my_df['index_bm'] = my_df.index
-    mydf_imputed = pd.merge(my_df
-                        , my_dfI
-                        , on = 'mutationinformation')
-    #mydf_imputed = mydf_imputed.set_index(['index_bm'])
-    
-    my_df['log10_or_mychisq'].isna().sum()
-    mydf_imputed['log10_or_mychisq'].isna().sum()
-    mydf_imputed['logorI'].isna().sum() # should be 0
-    
-    len(my_df.columns)
-    len(mydf_imputed.columns)  
-    
-    #-----------------------------------------
-    # REASSIGN my_df after imputing OR values
-    #-----------------------------------------
-    my_df = mydf_imputed.copy()
-    
-    if my_df['logorI'].isna().sum() == 0:
-        print('\nPASS: OR values imputed, data ready for ML')
-    else:
-        sys.exit('\nFAIL: something went wrong, Data not ready for ML. Please check upstream!')
-    
-    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    #---------------------------------------
-    # TODO: try other imputation like MICE
-    #---------------------------------------
-    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    
-    #%%########################################################################
-    #==========================
-    #     Data for ML
-    #==========================
-    my_df_ml = my_df.copy()
-    
-    # Build column names to mask for affinity changes
-    if gene.lower() in geneL_basic:
-        #X_stabilityN = common_cols_stabiltyN
-        gene_affinity_colnames = []# not needed as its the common ones 
-        cols_to_mask = ['ligand_affinity_change']
-        
-    if gene.lower() in geneL_ppi2:
-        gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] 
-        #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
-    
-    if gene.lower() in geneL_na:
-        gene_affinity_colnames =  ['mcsm_na_affinity'] 
-        #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
-    
-    if gene.lower() in geneL_na_ppi2:
-        gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-        #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
-    
-    #=======================
-    # Masking columns:
-    # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
-    #=======================
-    my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
-    my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
-    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
-    
-    # mask the mcsm affinity related columns where ligand distance > 10
-    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
-    (my_df_ml['ligand_affinity_change'] == 0).sum()
-    
-    mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]  
-    
-    #===================================================
-    # write file for check
-    mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
-    mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
-    #===================================================
-    ###############################################################################
-    #%% Feature groups (FG): Build X for Input ML 
-    ############################################################################
-    #===========================
-    # FG1: Evolutionary features
-    #===========================
-    X_evolFN =  ['consurf_score'
-               , 'snap2_score'
-               , 'provean_score']
-    
-    ###############################################################################
-    #========================
-    # FG2: Stability features
-    #========================
-    #--------
-    # common
-    #--------
-    X_common_stability_Fnum = [
-               'duet_stability_change'
-               , 'ddg_foldx'
-               , 'deepddg'
-               , 'ddg_dynamut2'
-               , 'contacts']
-    #--------
-    # FoldX
-    #--------
-    X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
-    , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
-    , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
-    , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
-    , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
-    , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss']
-    
-    X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum
-    
-    ###############################################################################
-    #===================
-    # FG3: Affinity features
-    #===================
-    common_affinity_Fnum =  ['ligand_distance'
-                    , 'ligand_affinity_change'
-                    , 'mmcsm_lig']
-    
-    # if gene.lower() in geneL_basic:
-    #     X_affinityFN = common_affinity_Fnum 
-    # else:
-    #     X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
-        
-    X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
-    
-    ###############################################################################
-    #============================
-    # FG4: Residue level features
-    #============================
-    #-----------
-    # AA index
-    #-----------
-    X_aaindex_Fnum = list(aa_df_cols)
-    print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum))
-    
-    #-----------------
-    # surface area
-    # depth
-    # hydrophobicity
-    #-----------------
-    X_str_Fnum =  ['rsa'
-               #, 'asa'
-               , 'kd_values'
-               , 'rd_values']   
-    
-    #---------------------------
-    # Other aa properties
-    # active site indication
-    #---------------------------
-    X_aap_Fcat = ['ss_class'
-                # , 'wt_prop_water'
-                # , 'mut_prop_water'
-                # , 'wt_prop_polarity'
-                # , 'mut_prop_polarity'
-                # , 'wt_calcprop'
-                # , 'mut_calcprop'
-                , 'aa_prop_change'
-                , 'electrostatics_change'
-                , 'polarity_change'
-                , 'water_change'
-                , 'active_site']
-       
-    X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
-    ###############################################################################
-    #========================
-    # FG5: Genomic features
-    #========================
-    X_gn_mafor_Fnum =  ['maf'
-                    #, 'logorI'
-                    # , 'or_rawI'
-                    # , 'or_mychisq'
-                    # , 'or_logistic'
-                    # , 'or_fisher'
-                    # , 'pval_fisher'
-                    ]
-    
-    X_gn_linegae_Fnum  = ['lineage_proportion'
-                          , 'dist_lineage_proportion'
-                          #, 'lineage' # could be included as a category but it has L2;L4  formatting
-                          , 'lineage_count_all'
-                          , 'lineage_count_unique'
-                          ]
-    
-    # X_gn_Fcat = ['drtype_mode_labels'  # beware then you can't use it to predict [USED it for uq_v1, not v2]
-    #                #, 'gene_name' # will be required for the combined stuff
-    #              ]
-    X_gn_Fcat = []
-    
-    X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat
-    ###############################################################################
-    #========================
-    # FG6 collapsed: Structural : Stability + Affinity + ResidueProp
-    #========================
-    X_structural_FN =  X_stability_FN + X_affinityFN + X_resprop_FN
-    
-    ###############################################################################
-    #========================
-    # BUILDING all features
-    #========================
-    all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
-    
-    ###############################################################################
-    #%% Define training and test data
-    #================================================================
-    # Training and BLIND test set: scaling law split
-    #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d
-    # dst with actual values  : training set
-    # dst with imputed values : THROW AWAY [unrepresentative]
-    # test data size ~ 1/sqrt(features NOT including target variable)
-    #================================================================
-    my_df_ml[drug].isna().sum()
-    
-    #    blind_test_df = my_df_ml[my_df_ml[drug].isna()]
-    #    blind_test_df.shape
-    
-    training_df = my_df_ml[my_df_ml[drug].notna()]
-    training_df.shape
-    
-    # Target 1: dst_mode
-    training_df[drug].value_counts()
-    training_df['dst_mode'].value_counts()
-    
-    ####################################################################
-    #====================================
-    # ML data: Train test split: SL
-    # with stratification
-    # 1-blind test : training_data for CV
-    # 1/sqrt(columns) : blind test 
-    #===========================================
-    x_features = training_df[all_featuresN]
-    y_target   = training_df['dst_mode']
-    
-    # sanity check
-    if not 'dst_mode' in x_features.columns:
-        print('\nPASS: x_features has no target variable')
-        x_ncols = len(x_features.columns)
-        print('\nNo. of columns for x_features:', x_ncols)
-        # NEED It for scaling law split
-        #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d
-    else:
-        sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')
-    #-------------------
-    # train-test split
-    #-------------------
-    sl_test_size = 1/np.sqrt(x_ncols)
-    train = 1 - sl_test_size
-
-    #x_train, x_test, y_train, y_test # traditional var_names
-    # so my downstream code doesn't need to change    
-    X, X_bts, y, y_bts = train_test_split(x_features, y_target
-                                                    , test_size = sl_test_size
-                                                    , **rs
-                                                    , stratify = y_target)
-    yc1 = Counter(y)
-    yc1_ratio = yc1[0]/yc1[1]
-    
-    yc2 = Counter(y_bts)
-    yc2_ratio = yc2[0]/yc2[1]
-    
-    ###############################################################################
-    #======================================================
-    # Determine categorical and numerical features
-    #======================================================
-    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
-    numerical_cols 
-    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
-    categorical_cols 
-    
-    ################################################################################
-    # IMPORTANT sanity checks
-    if len(X.columns) == len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN):
-        print('\nPASS: ML data with input features, training and test generated...'
-              , '\n\nTotal no. of input features:'        , len(X.columns)
-              , '\n--------No. of numerical features:'    , len(numerical_cols)
-              , '\n--------No. of categorical features:'  , len(categorical_cols)
-              
-              , '\n\nTotal no. of evolutionary features:' , len(X_evolFN)
-              
-              , '\n\nTotal no. of stability features:'    , len(X_stability_FN)
-              , '\n--------Common stabilty cols:'         , len(X_common_stability_Fnum)
-              , '\n--------Foldx cols:'                   , len(X_foldX_Fnum)
-              
-              , '\n\nTotal no. of affinity features:'     , len(X_affinityFN)
-              , '\n--------Common affinity cols:'         , len(common_affinity_Fnum)
-              , '\n--------Gene specific affinity cols:'  , len(gene_affinity_colnames)
-              
-              , '\n\nTotal no. of residue level features:', len(X_resprop_FN)
-              , '\n--------AA index cols:'                , len(X_aaindex_Fnum)
-              , '\n--------Residue Prop cols:'            , len(X_str_Fnum)
-              , '\n--------AA change Prop cols:'          , len(X_aap_Fcat)
-              
-              , '\n\nTotal no. of genomic features:'      , len(X_genomicFN)
-              , '\n--------MAF+OR cols:'                  , len(X_gn_mafor_Fnum)
-              , '\n--------Lineage cols:'                 , len(X_gn_linegae_Fnum)
-              , '\n--------Other cols:'                   , len(X_gn_Fcat)
-              )
-    else:
-        print('\nFAIL: numbers mismatch'
-              , '\nExpected:',len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN)
-              , '\nGot:', len(X.columns))
-        sys.exit()
-    ###############################################################################
-    print('\n-------------------------------------------------------------'
-          , '\nSuccessfully split data: ALL features'
-          , '\nactual values: training set'
-          ,  '\nSplit:', tts_split
-          #, '\nimputed values: blind test set'
-          
-          , '\n\nTotal data size:', len(X) + len(X_bts)
-    
-          , '\n\nTrain data size:', X.shape
-          , '\ny_train numbers:', yc1
-    
-          , '\n\nTest data size:', X_bts.shape
-          , '\ny_test_numbers:', yc2
-    
-          , '\n\ny_train ratio:',yc1_ratio
-          , '\ny_test ratio:', yc2_ratio
-          , '\n-------------------------------------------------------------'
-          )
-    ##########################################################################    
-    # Quick check
-    #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
-    for i in range(len(cols_to_mask)):
-        ind = i+1
-        print('\nindex:', i, '\nind:', ind)
-        print('\nMask count check:'
-              , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
-              )
-    
-    print('Original Data\n', Counter(y)
-          , 'Data dim:', X.shape)
-    ###########################################################################
-    #%% 
-    ###########################################################################
-    #                               RESAMPLING
-    ###########################################################################
-    #------------------------------
-    # Simple Random oversampling
-    # [Numerical + categorical]
-    #------------------------------
-    oversample = RandomOverSampler(sampling_strategy='minority')
-    X_ros, y_ros = oversample.fit_resample(X, y)
-    print('\nSimple Random OverSampling\n', Counter(y_ros))
-    print(X_ros.shape)
-    
-    #------------------------------
-    # Simple Random Undersampling
-    # [Numerical + categorical]
-    #------------------------------
-    undersample = RandomUnderSampler(sampling_strategy='majority')
-    X_rus, y_rus = undersample.fit_resample(X, y)
-    print('\nSimple Random UnderSampling\n', Counter(y_rus))
-    print(X_rus.shape)
-    
-    #------------------------------
-    # Simple combine ROS and RUS
-    # [Numerical + categorical]
-    #------------------------------
-    oversample = RandomOverSampler(sampling_strategy='minority')
-    X_ros, y_ros = oversample.fit_resample(X, y)
-    undersample = RandomUnderSampler(sampling_strategy='majority')
-    X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
-    print('\nSimple Combined Over and UnderSampling\n',  Counter(y_rouC))
-    print(X_rouC.shape)
-    
-    #------------------------------
-    # SMOTE_NC: oversampling 
-    # [numerical + categorical]
-    #https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
-    #------------------------------
-    # Determine categorical and numerical features
-    numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
-    numerical_ix
-    num_featuresL = list(numerical_ix)
-    numerical_colind = X.columns.get_indexer(list(numerical_ix) )
-    numerical_colind
-    
-    categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
-    categorical_ix    
-    categorical_colind = X.columns.get_indexer(list(categorical_ix))
-    categorical_colind
-    
-    k_sm = 5 # 5 is default
-    sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
-    X_smnc, y_smnc = sm_nc.fit_resample(X, y)
-    print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
-    print(X_smnc.shape)
-    globals().update(locals()) # HACK: copies every local variable of setvars() into the module's global namespace
-    #print("i did a horrible hack :-)")
-    ###############################################################################
-    #%% SMOTE RESAMPLING for NUMERICAL ONLY*
-    # #------------------------------
-    # # SMOTE: Oversampling
-    # # [Numerical ONLY]
-    # #------------------------------
-    # k_sm = 1
-    # sm = SMOTE(sampling_strategy = 'auto', k_neighbors = k_sm, **rs)
-    # X_sm, y_sm = sm.fit_resample(X, y)
-    # print(X_sm.shape)
-    # print('\nSMOTE OverSampling\n', Counter(y_sm))
-    # y_sm_df = y_sm.to_frame()
-    # y_sm_df.value_counts().plot(kind = 'bar')
-    
-    # #------------------------------
-    # # SMOTE: Over + Undersampling COMBINED
-    # # [Numerical ONLY]
-    # #-----------------------------
-    # sm_enn = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all', **rs, **njobs ))
-    # X_enn, y_enn = sm_enn.fit_resample(X, y)
-    # print(X_enn.shape)
-    # print('\nSMOTE Over+Under Sampling combined\n', Counter(y_enn))
-    
-    ###############################################################################
-    # TODO: Find over and undersampling JUST for categorical data
-        ###########################################################################
-    
-    print('\n#################################################################'
-          , '\nDim of X for gene:', gene.lower(), '\n',  X.shape
-          , '\n###############################################################')
--- a/scripts/ml/ml_functions/FS.py
+++ b/scripts/ml/ml_functions/FS.py
@ -4,7 +4,8 @@
 Created on Mon May 23 23:25:26 2022

@author: tanu
-""" 
+"""
+#%%
 import os, sys
 import pandas as pd
 import numpy as np
@ -389,4 +390,5 @@ def fsgs_rfecv(input_df
              , '\nOutput dict size:', len(output_modelD))
        return(output_modelD)
    else:
-        sys.exit('\nFAIL:numbers mismatch output dict length not as expected. Please check')
+        sys.exit('\nFAIL:numbers mismatch output dict length not as expected. Please check')
+    
--- a/scripts/ml/ml_functions/GetMLData.py
+++ b/scripts/ml/ml_functions/GetMLData.py
@ -37,6 +37,7 @@ from sklearn.pipeline import Pipeline, make_pipeline
 import argparse
 import re

+
 def getmldata(gene, drug
              , data_combined_model = False
              , use_or = False
--- a/scripts/ml/ml_functions/MultClfs.py
+++ b/scripts/ml/ml_functions/MultClfs.py
@ -0,0 +1,533 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Mar  4 15:25:33 2022
+
+@author: tanu
+"""
+#%%
+import os, sys
+import pandas as pd
+import numpy as np
+import pprint as pp
+from copy import deepcopy
+from sklearn import linear_model
+from sklearn import datasets
+from collections import Counter
+
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
+from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier
+
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.gaussian_process import GaussianProcessClassifier, kernels
+from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel
+
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
+from sklearn.neural_network import MLPClassifier
+
+from sklearn.svm import SVC
+from xgboost import XGBClassifier
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
+
+from sklearn.compose import ColumnTransformer
+from sklearn.compose import make_column_transformer
+
+from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
+
+# added
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
+
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
+from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
+
+from sklearn.pipeline import Pipeline, make_pipeline
+
+from sklearn.feature_selection import RFE, RFECV
+
+import itertools
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+from statistics import mean, stdev, median, mode
+
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.under_sampling import RandomUnderSampler
+from imblearn.over_sampling import SMOTE
+from sklearn.datasets import make_classification
+from imblearn.combine import SMOTEENN
+from imblearn.combine import SMOTETomek
+
+from imblearn.over_sampling import SMOTENC
+from imblearn.under_sampling import EditedNearestNeighbours
+from imblearn.under_sampling import RepeatedEditedNearestNeighbours
+
+from sklearn.model_selection import GridSearchCV
+from sklearn.base import BaseEstimator
+from sklearn.impute import KNNImputer as KNN
+import json
+import argparse
+import re
+#%% GLOBALS
+# Keyword-argument bundles that are unpacked (**rs, **njobs) into estimators,
+# CV splitters and resamplers throughout this module.
+rs    = dict(random_state = 42)
+njobs = dict(n_jobs = 10)
+
+# Scorers handed to cross_validate(); one make_scorer() wrapper per metric.
+scoring_fn = {score_name: make_scorer(score_func)
+              for score_name, score_func in [('mcc'        , matthews_corrcoef)
+                                             , ('fscore'   , f1_score)
+                                             , ('precision', precision_score)
+                                             , ('recall'   , recall_score)
+                                             , ('accuracy' , accuracy_score)
+                                             , ('roc_auc'  , roc_auc_score)
+                                             , ('jcc'      , jaccard_score)]}
+
+# 10-fold stratified CV, shuffled with the fixed random state
+skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, **rs)
+
+# 10-fold stratified CV repeated 3 times
+rskf_cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, **rs)
+
+# Single-metric scorer dicts
+mcc_score_fn  = {'mcc': make_scorer(matthews_corrcoef)}
+jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
+
+###############################################################################
+# Display order of score types: position in the list (1-based) is the order.
+score_type_ordermapD = {score_type: order
+                        for order, score_type in enumerate(['mcc'
+                                                            , 'fscore'
+                                                            , 'jcc'
+                                                            , 'precision'
+                                                            , 'recall'
+                                                            , 'accuracy'
+                                                            , 'roc_auc'
+                                                            , 'TN'
+                                                            , 'FP'
+                                                            , 'FN'
+                                                            , 'TP'
+                                                            , 'trainingY_neg'
+                                                            , 'trainingY_pos'
+                                                            , 'blindY_neg'
+                                                            , 'blindY_pos'
+                                                            , 'fit_time'
+                                                            , 'score_time'], start = 1)}
+
+# Shared metric -> display label table; the CV ('test_*') and blind test
+# ('bts_*') column maps below only differ in their key prefix.
+_score_labelD = {'mcc'        : 'MCC'
+                 , 'fscore'   : 'F1'
+                 , 'precision': 'Precision'
+                 , 'recall'   : 'Recall'
+                 , 'accuracy' : 'Accuracy'
+                 , 'roc_auc'  : 'ROC_AUC'
+                 , 'jcc'      : 'JCC'}
+
+scoreCV_mapD = {'test_' + metric: label for metric, label in _score_labelD.items()}
+
+scoreBT_mapD = {'bts_' + metric: label for metric, label in _score_labelD.items()}
+
+#%%############################################################################
+############################
+# MultModelsCl()
+# Run Multiple Classifiers
+############################
+# Multiple Classification - Model Pipeline
+def MultModelsCl(input_df, target, skf_cv
+                       , blind_test_df
+                       , blind_test_target
+                       , tts_split_type 
+                       , run_blind_test = True
+
+                       , resampling_type = 'none' # default
+                       , add_cm = True # adds confusion matrix based on cross_val_predict
+                       , add_yn = True  # adds target var class numbers
+                       , var_type = ['numerical', 'categorical','mixed']
+                       , return_formatted_output = True):
+
+    '''
+    Run a fixed panel of classifiers through cross-validation and, optionally,
+    score each fitted pipeline on a blind test set.
+
+    @param input_df: training features WITHOUT the target variable
+    @type: pd.DataFrame
+
+    @param target: target (or output) feature; class counts are read with
+    Counter(target)[0] (negative) and Counter(target)[1] (positive)
+    @type: np.array or Series
+
+    @param skf_cv: passed as cv= to cross_validate() and cross_val_predict()
+    @type: int or StratifiedKFold()
+
+    @param blind_test_df: blind test features, same columns as input_df.
+    Only required when run_blind_test = True
+    @type: pd.DataFrame or None
+
+    @param blind_test_target: blind test target. Only required when run_blind_test = True
+    @type: np.array or Series or None
+
+    @param tts_split_type: label of the train/test split, stored as 'tts_split'
+    @type: str
+
+    @param run_blind_test: fit each pipeline on input_df and add 'bts_*' scores
+    @type: bool
+
+    @param resampling_type: label of the resampling used, stored as 'resampling'
+    @type: str
+
+    @param add_cm: add TN, FP, FN, TP computed from cross_val_predict()
+    @type: bool
+
+    @param add_yn: add the numbers of negative and positive training targets
+    @type: bool
+
+    @param var_type: ONE of 'numerical', 'categorical' or 'mixed'; decides which
+    column transformers are applied (MinMaxScaler and/or OneHotEncoder).
+    The list default only enumerates the choices and must be overridden.
+    @type: str
+
+    @param return_formatted_output: return the output of ProcessMultModelsCl()
+    instead of the raw scores dict
+    @type: bool
+
+    returns
+    Output of ProcessMultModelsCl() when return_formatted_output = True, else a
+    dict: {model_name: {score or metadata name: value}} holding the mean of
+    every cross_validate() output plus the optional additions above.
+
+    raises
+    ValueError when var_type is not one of the three choices, or when
+    run_blind_test = True and no blind test data is supplied.
+    '''
+
+    #======================================================
+    # Determine categorical and numerical features
+    #======================================================
+    numerical_ix   = input_df.select_dtypes(include=['int64', 'float64']).columns
+    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+
+    #======================================================
+    # Determine preprocessing steps ~ var_type
+    #======================================================
+    # Fail early with a clear message: the list default (or a typo) would
+    # otherwise leave the transformer list undefined.
+    if not isinstance(var_type, str) or var_type not in ('numerical', 'categorical', 'mixed'):
+        raise ValueError("var_type must be one of 'numerical', 'categorical' or 'mixed', got: "
+                         + repr(var_type))
+
+    t = []
+    if var_type in ('numerical', 'mixed'):
+        t.append(('num', MinMaxScaler(), numerical_ix))
+    if var_type in ('categorical', 'mixed'):
+        t.append(('cat', OneHotEncoder(), categorical_ix))
+
+    col_transform = ColumnTransformer(transformers = t
+                                       , remainder='passthrough')
+
+    #======================================================
+    # Target numbers: training and (when supplied) blind test
+    #======================================================
+    yc1       = Counter(target)
+    yc1_ratio = round(yc1[0]/yc1[1], 2)
+
+    have_blind_data = blind_test_df is not None and blind_test_target is not None
+    if run_blind_test and not have_blind_data:
+        raise ValueError('run_blind_test = True needs both blind_test_df and blind_test_target')
+
+    if have_blind_data:
+        yc2       = Counter(blind_test_target)
+        yc2_ratio = round(yc2[0]/yc2[1], 2)
+
+    #======================================================
+    # Specify multiple Classification Models  
+    #======================================================
+    models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
+               , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
+               , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
+               , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
+               , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
+               , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
+               , ('Gaussian NB'               , GaussianNB() )
+               , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
+               , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
+               , ('LDA'                       , LinearDiscriminantAnalysis() )
+               , ('Logistic Regression'       , LogisticRegression(**rs) )
+               , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
+               , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
+               , ('Multinomial'               , MultinomialNB() )
+               , ('Naive Bayes'               , BernoulliNB() )
+               , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
+               , ('QDA'                       , QuadraticDiscriminantAnalysis() )
+               , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000 ) ) 
+               , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
+                                                                       , n_estimators     = 1000
+                                                                       , bootstrap        = True
+                                                                       , oob_score        = True
+                                                                       , **njobs
+                                                                       , **rs
+                                                                       , max_features     = 'auto') ) 
+                , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
+                , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
+                , ('SVC'                       , SVC(**rs) ) 
+                , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
+                , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
+             ]
+
+    mm_skf_scoresD = {}
+
+    print('\n==============================================================\n'
+          , '\nRunning several classification models (n):', len(models)
+          ,'\nList of models:')
+    for m in models:
+        print(m)
+    print('\n================================================================\n')
+
+    for index, (model_name, model_fn) in enumerate(models, start = 1):
+        print('\nRunning classifier:', index
+              , '\nModel_name:'               , model_name
+              , '\nModel func:'               , model_fn)
+
+        model_pipeline = Pipeline([
+            ('prep'     , col_transform)
+            , ('model'  , model_fn)])
+
+        print('\nRunning model pipeline:', model_pipeline)
+        skf_cv_modD = cross_validate(model_pipeline
+                              , input_df
+                              , target
+                              , cv = skf_cv
+                              , scoring = scoring_fn
+                              , return_train_score = True)
+        #==============================
+        # Extract mean values for CV 
+        #==============================
+        # model_scoresD is the same object as mm_skf_scoresD[model_name]
+        model_scoresD = {}
+        mm_skf_scoresD[model_name] = model_scoresD
+
+        for key, value in skf_cv_modD.items():
+            print('\nkey:', key, '\nvalue:', value)
+            print('\nmean value:', np.mean(value))
+            model_scoresD[key] = round(np.mean(value),2)
+
+        # ADD more info: meta data related to input df
+        model_scoresD['resampling']        = resampling_type
+        model_scoresD['n_training_size']   = len(input_df)
+        model_scoresD['n_trainingY_ratio'] = yc1_ratio
+        model_scoresD['n_features']        = len(input_df.columns)
+        model_scoresD['tts_split']         = tts_split_type
+
+        #######################################################################
+        #======================================================
+        # Option: Add confusion matrix from cross_val_predict
+        # Understand and USE with caution
+        #======================================================
+        if add_cm:
+            y_pred = cross_val_predict(model_pipeline, input_df, target, cv = skf_cv, **njobs)
+            # confusion_matrix expects (y_true, y_pred); with the arguments the
+            # other way round the FP and FN counts are exchanged.
+            tn, fp, fn, tp = confusion_matrix(target, y_pred).ravel()
+
+            model_scoresD.update({'TN'  : tn
+                                  , 'FP': fp
+                                  , 'FN': fn
+                                  , 'TP': tp})
+
+        #=============================================
+        # Option: Add targety numbers for data
+        #=============================================
+        if add_yn:
+            model_scoresD.update({'n_trainingY_neg'    : yc1[0]
+                                  , 'n_trainingY_pos'  : yc1[1] })
+
+        #=========================
+        # Option: Blind test (bts)
+        #=========================
+        if run_blind_test:
+            # Blind test numbers
+            model_scoresD.update({'n_blindY_neg'    : yc2[0]
+                                  , 'n_blindY_pos'  : yc2[1]
+                                  , 'n_testY_ratio' : yc2_ratio
+                                  , 'n_test_size'   : len(blind_test_df) })
+
+            #--------------------------------------------------------
+            # Fit on the full training data and score the blind test
+            #--------------------------------------------------------
+            model_pipeline.fit(input_df, target)
+            bts_predict = model_pipeline.predict(blind_test_df)
+
+            bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
+            print('\nMCC on Blind test:'     , bts_mcc_score)
+            print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
+
+            model_scoresD['bts_mcc']       = bts_mcc_score
+            model_scoresD['bts_fscore']    = round(f1_score(blind_test_target, bts_predict),2)
+            model_scoresD['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2)
+            model_scoresD['bts_recall']    = round(recall_score(blind_test_target, bts_predict),2)
+            model_scoresD['bts_accuracy']  = round(accuracy_score(blind_test_target, bts_predict),2)
+            model_scoresD['bts_roc_auc']   = round(roc_auc_score(blind_test_target, bts_predict),2)
+            model_scoresD['bts_jcc']       = round(jaccard_score(blind_test_target, bts_predict),2)
+
+        elif have_blind_data:
+            # Blind data supplied but not scored: still record its size and ratio
+            model_scoresD['n_test_size']   = len(blind_test_df)
+            model_scoresD['n_testY_ratio'] = yc2_ratio
+
+    #============================
+    # Process the dict to have WF
+    #============================
+    if return_formatted_output:
+        # Only ask for the blind test rows when blind test scores exist
+        CV_BT_metaDF = ProcessMultModelsCl(mm_skf_scoresD, blind_test_data = run_blind_test)
+        return(CV_BT_metaDF)
+    else:
+        return(mm_skf_scoresD)
+
+#%% Process output function ###################################################
+############################
+# ProcessMultModelsCl() 
+############################
+# Processes the dict returned by MultModelsCl() when return_formatted_output = True
+
+def ProcessMultModelsCl(inputD = {}, blind_test_data = True):
+    '''
+    Turn the nested scores dict produced by MultModelsCl() into a single
+    data frame with one row per model and data source.
+
+    @param inputD: {model_name: {score or metadata name: value}}; every inner
+    dict must have a 'tts_split' key. The default dict is never modified.
+    @type: dict
+
+    @param blind_test_data: when True, the 'bts_*' scores are row-bound below
+    the CV ('test_*') scores before the metadata is merged in
+    @type: bool
+
+    returns
+    pd.DataFrame indexed by model name with the renamed score columns,
+    'source_data' ('CV' or 'BT'), the metadata columns (everything that is not
+    'test_*', 'bts_*' or 'train_*') and 'Model_name'. If the CV and BT frames
+    cannot be row-bound, only the CV rows are returned.
+
+    Exits via sys.exit() when the row-bound frame does not have the expected
+    dimensions or its model names do not match the metadata.
+    '''
+
+    scoresDFT = pd.DataFrame(inputD).T
+
+    #------------------------
+    #  Extracting split_name
+    #-----------------------
+    tts_split_nameS = {v['tts_split'] for v in inputD.values()}
+
+    if len(tts_split_nameS) == 1:
+        tts_split_name = str(next(iter(tts_split_nameS)))
+        print('\nExtracting tts_split_name:', tts_split_name)
+    else:
+        # Always bind the name: it is printed further down, and would raise a
+        # NameError whenever the models do not share a single split label.
+        tts_split_name = ', '.join(sorted(str(s) for s in tts_split_nameS))
+
+    #----------------------
+    #  WF: CV results
+    #----------------------
+    # .copy() so that adding 'source_data' never writes to a view of scoresDFT
+    scoresDF_CV = scoresDFT.filter(regex='^test_.*$', axis = 1).copy()
+    # map colnames for consistency to allow concatenting
+    scoresDF_CV.columns = scoresDF_CV.columns.map(scoreCV_mapD)
+    scoresDF_CV['source_data'] = 'CV'
+
+    #----------------------
+    #  WF: Meta data 
+    #----------------------
+    metaDF = scoresDFT.filter(regex='^(?!test_.*$|bts_.*$|train_.*$).*')
+
+    print('\nTotal cols in each df:'
+          , '\nCV df:', len(scoresDF_CV.columns)
+          , '\nmetaDF:', len(metaDF.columns))
+
+    #-------------------------------------
+    # Combine WF: CV + Metadata
+    #-------------------------------------
+    combDF = pd.merge(scoresDF_CV, metaDF, left_index = True, right_index = True)
+    print('\nAdding column: Model_name')
+    combDF['Model_name'] = combDF.index
+
+    #----------------------
+    #  WF: BTS results
+    #----------------------
+    if blind_test_data:
+
+        scoresDF_BT = scoresDFT.filter(regex='^bts_.*$', axis = 1).copy()
+        # map colnames for consistency to allow concatenting
+        scoresDF_BT.columns = scoresDF_BT.columns.map(scoreBT_mapD)
+        scoresDF_BT['source_data'] = 'BT'
+
+        print('\nTotal cols in bts df:'
+              , '\nBT_df:', len(scoresDF_BT.columns))
+
+        if len(scoresDF_CV.columns) == len(scoresDF_BT.columns):
+            print('\nFirst proceeding to rowbind CV and BT dfs:')
+            expected_ncols_out = len(scoresDF_BT.columns) + len(metaDF.columns)
+            print('\nFinal output should have:', expected_ncols_out, 'columns' )
+
+        #-----------------
+        # Combine WF
+        #-----------------
+        dfs_combine_wf = [scoresDF_CV, scoresDF_BT]
+
+        print('\nCombinig', len(dfs_combine_wf), 'using pd.concat by row ~ rowbind'
+              , '\nChecking Dims of df to combine:'
+              , '\nDim of CV:', scoresDF_CV.shape
+              , '\nDim of BT:', scoresDF_BT.shape)
+
+        expected_nrows_wf = len(dfs_combine_wf) * max(len(df) for df in dfs_combine_wf)
+        expected_ncols_wf = max(len(df.columns) for df in dfs_combine_wf)
+
+        # Keep the CV column order: a bare list(set(...)) gives an arbitrary
+        # column order that can differ from one run to the next.
+        common_colS    = set.intersection(*(set(df.columns) for df in dfs_combine_wf))
+        common_cols_wf = [col for col in scoresDF_CV.columns if col in common_colS]
+        print('\nNumber of Common columns:', len(common_cols_wf)
+              , '\nThese are:', common_cols_wf)
+
+        if len(common_cols_wf) == expected_ncols_wf:
+            combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index=False)
+            print('\nConcatenating dfs with different resampling methods [WF]:'
+                  , '\nSplit type:', tts_split_name
+                  , '\nNo. of dfs combining:', len(dfs_combine_wf))
+
+            if len(combined_baseline_wf) == expected_nrows_wf  and len(combined_baseline_wf.columns) == expected_ncols_wf:
+                print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
+                      , '\nnrows in combined_df_wf:', len(combined_baseline_wf)
+                      , '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
+            else:
+                print('\nFAIL: concatenating failed'
+                      , '\nExpected nrows:', expected_nrows_wf
+                      , '\nGot:', len(combined_baseline_wf)
+                      , '\nExpected ncols:', expected_ncols_wf
+                      , '\nGot:', len(combined_baseline_wf.columns))
+                sys.exit('\nFIRST IF FAILS')
+
+            # The row-bound scores and the metadata must cover the same models
+            if set(combined_baseline_wf.index) == set(metaDF.index):
+                print('\nPASS: proceeding to merge metadata with CV and BT dfs')
+                combDF = pd.merge(combined_baseline_wf, metaDF, left_index = True, right_index = True)
+                print('\nAdding column: Model_name')
+                combDF['Model_name'] = combDF.index
+
+            else:
+                sys.exit('\nFAIL: Could not merge metadata with CV and BT dfs')
+
+        else:
+            # combDF stays as the CV + metadata frame built above
+            print('\nConcatenting dfs not possible [WF],check numbers ')
+
+        print('\n========================================================='
+              , '\nSUCCESS: Ran multiple classifiers'
+              , '\n=======================================================')
+
+    return combDF
+
+###############################################################################
--- a/scripts/ml/combined_model/untitled0.py
+++ b/scripts/ml/combined_model/untitled0.py
@ -39,16 +39,21 @@ from sklearn.pipeline import Pipeline, make_pipeline
 import argparse
 import re
 homedir = os.path.expanduser("~")
-#%% Globals
+#%% GLOBALS
 rs = {'random_state': 42}
 njobs = {'n_jobs': 10}
+
 #%% Define split_tts function #################################################
 def split_tts(ml_input_data
-              , data_type      = ['actual', 'complete', 'reverse']
+              , data_type      = ['actual', 'complete']
              , split_type     = ['70_30', '80_20', 'sl']
              , oversampling   = True
              , dst_colname    = 'dst'# determine how to subset the actual vs reverse data
-              , target_colname = 'dst_mode'):
+              , target_colname = 'dst_mode'
+              , include_gene_name = True
+              , k_smote = 5):
+    
+    outDict = {}
    
    print('\nInput params:' 
          , '\nDim of input df:'   , ml_input_data.shape
@ -60,6 +65,11 @@ def split_tts(ml_input_data
        print('\noversampling enabled')
    else:
        print('\nNot generating oversampled or undersampled data')
+    
+    if include_gene_name:
+        cols_to_dropL = []
+    else:
+        cols_to_dropL = ['gene_name']

    #====================================
    # evaluating use_data_type
@ -68,21 +78,26 @@ def split_tts(ml_input_data
        ml_data = ml_input_data[ml_input_data[dst_colname].notna()]
    if data_type == 'complete':
        ml_data = ml_input_data.copy()
-    if data_type == 'reverse':
-        ml_data = ml_input_data[ml_input_data[dst_colname].isna()]
-    #if_data_type == none
        
    #====================================
    # separate features and target
    #====================================
-    x_features = ml_data.drop([target_colname, dst_colname], axis = 1)
-    y_target   = ml_data[target_colname]
+    cols_to_dropL = cols_to_dropL + [target_colname, dst_colname]
+    x_features    = ml_data.drop(cols_to_dropL, axis = 1)
+    y_target      = ml_data[target_colname]
        
    # sanity check
-    if not 'dst_mode' in x_features.columns:
-        print('\nPASS: x_features has no target variable')
+    check1 = x_features[[i for i in cols_to_dropL if i in x_features.columns]]
+    
+    #if not 'dst_mode' in x_features.columns:
+    if check1.empty:
+        print('\nPASS: x_features has no target variable and no dst column'
+              , '\nDropped cols:', len(cols_to_dropL)
+              , '\nThese were:', target_colname,'and', dst_colname)
        x_ncols = len(x_features.columns)
-        print('\nNo. of columns for x_features:', x_ncols)
+        print('\nNo. of cols in input df:', len(ml_input_data.columns)
+              , '\nNo.of cols dropped:', len(cols_to_dropL)
+              , '\nNo. of columns for x_features:', x_ncols)
    else:
        sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')
        
@ -129,7 +144,12 @@ def split_tts(ml_input_data
          , '\n\nTotal no. of input features:'      , len(X.columns)
          , '\n--------No. of numerical features:'  , len(numerical_cols)
          , '\n--------No. of categorical features:', len(categorical_cols)
-      
+          
+          , '\n==========================='
+          , '\n Resampling: NONE'
+          , '\nBaseline'
+          , '\n==========================='
+          
          , '\n\nTotal data size:', len(X) + len(X_bts)
    
          , '\n\nTrain data size:', X.shape
@ -140,11 +160,15 @@ def split_tts(ml_input_data
    
          , '\n\ny_train ratio:',yc1_ratio
          , '\ny_test ratio:', yc2_ratio
-          , '\n-------------------------------------------------------------'
-      )
+          , '\n-------------------------------------------------------------')
+    
+    outDict.update({'X'       : X
+            , 'X_bts' : X_bts
+            , 'y'     : y
+            , 'y_bts' : y_bts
+            } ) 
    
    if oversampling:
-        
        #######################################################################
        #                               RESAMPLING
        #######################################################################
@ -194,28 +218,70 @@ def split_tts(ml_input_data
        categorical_colind = X.columns.get_indexer(list(categorical_ix))
        categorical_colind
        
-        k_sm = 5 # default
+        #k_sm = 5 # default
+        k_sm = k_smote
        sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
        X_smnc, y_smnc = sm_nc.fit_resample(X, y)
        print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
        print(X_smnc.shape)
        
-        print('\nGenerated resampled data as below:'
-            , '\n==========================='
-            , '\nRandom oversampling:'
-            , '\n==========================='
+        print('\nGenerated Resampled data as below:'
+            , '\n================================='
+            , '\nResampling: Random oversampling'
+            , '\n================================'
             
            , '\n\nTrain data size:', X_ros.shape
-     
-            , '\ny_train numbers:', y_ros
-            , '\n\ny_train ratio:', Counter(y_ros)[0]/Counter(y_ros)[0]
+            , '\ny_train numbers:', len(y_ros)
+            , '\n\ny_train ratio:', Counter(y_ros)[0]/Counter(y_ros)[1]
            
            , '\ny_test ratio:' , yc2_ratio
+            ##################################################################
+            , '\n================================'
+            , '\nResampling: Random underampling'
+            , '\n================================'
            
-          , '\n-------------------------------------------------------------'
-      )
-        
+            , '\n\nTrain data size:', X_rus.shape
+            , '\ny_train numbers:', len(y_rus)
+            , '\n\ny_train ratio:', Counter(y_rus)[0]/Counter(y_rus)[1]
+            
+            , '\ny_test ratio:' , yc2_ratio
+            ##################################################################
+            , '\n================================'
+            , '\nResampling:Combined (over+under)'
+            , '\n================================'
+                                    
+            , '\n\nTrain data size:', X_rouC.shape
+            , '\ny_train numbers:', len(y_rouC)
+            , '\n\ny_train ratio:', Counter(y_rouC)[0]/Counter(y_rouC)[1]
+            
+            , '\ny_test ratio:' , yc2_ratio
+            ##################################################################            
+            , '\n=============================='
+            , '\nResampling: Smote NC'
+            , '\n=============================='
+            
+            , '\n\nTrain data size:', X_smnc.shape
+            , '\ny_train numbers:', len(y_smnc)
+            , '\n\ny_train ratio:', Counter(y_smnc)[0]/Counter(y_smnc)[1]
+            
+            , '\ny_test ratio:' , yc2_ratio
+            ##################################################################
+           , '\n-------------------------------------------------------------')
+
+        outDict.update({'X_ros'   : X_ros
+                        , 'y_ros' : y_ros
+                            
+                        , 'X_rus' : X_rus
+                        , 'y_rus' : y_rus
+                            
+                        , 'X_rouC': X_rouC
+                        , 'y_rouC': y_rouC
+                            
+                        , 'X_smnc': X_smnc
+                        , 'y_smnc': y_smnc})
+        return(outDict)
        
     #   globals().update(locals()) # TROLOLOLOLOLOLS
-        
-    #return()
+     
+    else:
+        return(outDict)
--- a/scripts/ml/run_7030.py
+++ b/scripts/ml/run_7030.py
@ -1,141 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Jun 20 13:05:23 2022
-
-@author: tanu
-"""
-#%%Imports ####################################################################
-import re
-import argparse
-import os, sys
-
-# gene  = 'pncA'
-# drug  = 'pyrazinamide'
-#total_mtblineage_uc = 8
-###############################################################################
-#%% command line args: case sensitive
-arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
-arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
-args = arg_parser.parse_args()
-
-drug    = args.drug
-gene    = args.gene
-
-###############################################################################
-homedir = os.path.expanduser("~")
-sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
-
-###############################################################################
-#==================
-# Import data
-#==================
-from ml_data_7030 import *
-setvars(gene,drug)
-from ml_data_7030 import *
-
-# from YC run_all_ML: run locally
-#from UQ_yc_RunAllClfs import run_all_ML
-
-#====================
-# Import ML functions 
-#====================
-from MultClfs import *
-
-#==================
-# other vars
-#==================
-tts_split_7030    = '70_30'
-OutFile_suffix  = '7030'
-
-#==================
-# Specify outdir 
-#==================
-outdir_ml = outdir + 'ml/tts_7030/'
-print('\nOutput directory:', outdir_ml)
-
-#outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
-outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'
-#%% Running models ############################################################
-print('\n#####################################################################\n'
-      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
-      , '\nGene name:', gene
-      , '\nDrug name:', drug
-      , '\n#####################################################################\n')
-
-paramD = {
-        'baseline_paramD': { 'input_df'        : X
-                            , 'target'         : y
-                            , 'var_type'       : 'mixed'
-                            , 'resampling_type': 'none'}
-        
-        , 'smnc_paramD': { 'input_df'          : X_smnc
-                          , 'target'           : y_smnc
-                          , 'var_type'         : 'mixed'
-                          , 'resampling_type'  : 'smnc'}
-    
-        , 'ros_paramD': { 'input_df'           : X_ros
-                        , 'target'             : y_ros
-                        , 'var_type'           : 'mixed'
-                        , 'resampling_type'    : 'ros'}
-
-        , 'rus_paramD' : { 'input_df'          : X_rus
-                          , 'target'           : y_rus
-                          , 'var_type'         : 'mixed'
-                          , 'resampling_type'  : 'rus'}
-
-        , 'rouC_paramD' : { 'input_df'         : X_rouC
-                            , 'target'          : y_rouC
-                            , 'var_type'        : 'mixed'
-                            , 'resampling_type' : 'rouC'}
-        }
-
-##==============================================================================
-## Dict with no CV BT formatted df
-## mmD = {}
-## for k, v in paramD.items():
-## #    print(mmD[k])
-##     scores_7030D = MultModelsCl(**paramD[k]
-##                         , tts_split_type = tts_split_7030
-##                         , skf_cv = skf_cv
-##                         , blind_test_df = X_bts
-##                         , blind_test_target = y_bts
-##                         , add_cm = True 
-##                         , add_yn = True
-##                         , return_formatted_output = False)
-##     mmD[k] = scores_7030D
-##==============================================================================
-## Initial run to get the dict of dicts for each sampling type containing CV, BT and metadata DFs 
-mmDD = {}
-for k, v in paramD.items():
-    scores_7030D = MultModelsCl(**paramD[k]
-                        , tts_split_type = tts_split_7030
-                        , skf_cv = skf_cv
-                        , blind_test_df = X_bts
-                        , blind_test_target = y_bts
-                        , add_cm = True 
-                        , add_yn = True
-                        , return_formatted_output = True)
-    mmDD[k] = scores_7030D
-
-# Extracting the dfs from within the dict and concatenating to output as one df
-for k, v in mmDD.items():
-    out_wf_7030 = pd.concat(mmDD, ignore_index = True)
-
-out_wf_7030f = out_wf_7030.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
-    
-print('\n######################################################################'
-      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
-      , '\nGene:', gene.lower()
-      , '\nDrug:', drug
-      , '\noutput file:', outFile_wf
-      , '\nDim of output:', out_wf_7030f.shape
-      , '\n######################################################################')
-###############################################################################
-#====================
-# Write output file
-#====================
-out_wf_7030f.to_csv(outFile_wf, index = False)
-print('\nFile successfully written:', outFile_wf)
-###############################################################################
--- a/scripts/ml/run_7030_LOOP.py
+++ b/scripts/ml/run_7030_LOOP.py
@ -1,126 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Jun 20 13:05:23 2022
-
-@author: tanu
-"""
-#%%Imports ####################################################################
-import re
-import argparse
-import os, sys
-import collections
-
-# gene  = 'pncA'
-# drug  = 'pyrazinamide'
-#total_mtblineage_uc = 8
-###############################################################################
-#%% command line args: case sensitive
-# arg_parser = argparse.ArgumentParser()
-# arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
-# arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
-# args = arg_parser.parse_args()
-
-# drug    = args.drug
-# gene    = args.gene
-
-###############################################################################
-homedir = os.path.expanduser("~")
-sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
-
-###############################################################################
-#==================
-# Import data
-#==================
-from ml_data_7030 import *
-setvars(gene,drug)
-from ml_data_7030 import *
-
-# from YC run_all_ML: run locally
-#from UQ_yc_RunAllClfs import run_all_ML
-
-#====================
-# Import ML functions 
-#====================
-from MultClfs import *
-
-#==================
-# other vars
-#==================
-tts_split_7030    = '70_30'
-OutFile_suffix  = '7030'
-
-#==================
-# Specify outdir 
-#==================
-outdir_ml = outdir + 'ml/tts_7030/'
-print('\nOutput directory:', outdir_ml)
-
-outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
-#outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
-
-#%% Running models ############################################################
-print('\n#####################################################################\n'
-      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
-      , '\nGene name:', gene
-      , '\nDrug name:', drug
-      , '\n#####################################################################\n')
-
-paramD = {
-        'baseline_paramD': { 'input_df'        : X
-                            , 'target'         : y
-                            , 'var_type'       : 'mixed'
-                            , 'resampling_type': 'none'}
-        
-        , 'smnc_paramD': { 'input_df'          : X_smnc
-                          , 'target'           : y_smnc
-                          , 'var_type'         : 'mixed'
-                          , 'resampling_type'  : 'smnc'}
-    
-        , 'ros_paramD': { 'input_df'           : X_ros
-                        , 'target'             : y_ros
-                        , 'var_type'           : 'mixed'
-                        , 'resampling_type'    : 'ros'}
-
-        , 'rus_paramD' : { 'input_df'          : X_rus
-                          , 'target'           : y_rus
-                          , 'var_type'         : 'mixed'
-                          , 'resampling_type'  : 'rus'}
-
-        , 'rouC_paramD' : { 'input_df'         : X_rouC
-                            , 'target'          : y_rouC
-                            , 'var_type'        : 'mixed'
-                            , 'resampling_type' : 'rouC'}
-        }
-
-# Initial run to get the dict containing CV, BT and metadata DFs 
-mmD = {}
-for k, v in paramD.items():
-#    print(fooD[k])
-    scores_7030D = MultModelsCl(**paramD[k]
-                        , tts_split_type = tts_split_7030
-                        , skf_cv = skf_cv
-                        , blind_test_df = X_bts
-                        , blind_test_target = y_bts
-                        , add_cm = True 
-                        , add_yn = True
-                        , return_formatted_output = True)
-    mmD[k] = scores_7030D
-    
-for k, v in mmD.items():
-    out_wf_7030 = pd.concat(mmD, ignore_index = True)
-    
-print('\n######################################################################'
-      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
-      , '\nGene:', gene.lower()
-      , '\nDrug:', drug
-      , '\noutput file:', outFile_wf
-      , '\nDim of output:', out_wf_7030.shape
-      , '\n######################################################################')
-###############################################################################
-#====================
-# Write output file
-#====================
-out_wf_7030.to_csv(outFile_wf, index = False)
-print('\nFile successfully written:', outFile_wf)
-###############################################################################
--- a/scripts/ml/run_8020.py
+++ b/scripts/ml/run_8020.py
@ -1,141 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Jun 20 13:05:23 2022
-
-@author: tanu
-"""
-#%%Imports ####################################################################
-import re
-import argparse
-import os, sys
-
-# gene  = 'pncA'
-# drug  = 'pyrazinamide'
-#total_mtblineage_uc = 8
-###############################################################################
-#%% command line args: case sensitive
-arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
-arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
-args = arg_parser.parse_args()
-
-drug    = args.drug
-gene    = args.gene
-
-###############################################################################
-homedir = os.path.expanduser("~")
-sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
-
-###############################################################################
-#==================
-# Import data
-#==================
-from ml_data_8020 import *
-setvars(gene,drug)
-from ml_data_8020 import *
-
-# from YC run_all_ML: run locally
-#from UQ_yc_RunAllClfs import run_all_ML
-
-#====================
-# Import ML functions 
-#====================
-from MultClfs import *
-
-#==================
-# other vars
-#==================
-tts_split_8020    = '80_20'
-OutFile_suffix  = '8020'
-
-#==================
-# Specify outdir 
-#==================
-outdir_ml = outdir + 'ml/tts_8020/'
-print('\nOutput directory:', outdir_ml)
-
-#outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
-outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'
-#%% Running models ############################################################
-print('\n#####################################################################\n'
-      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
-      , '\nGene name:', gene
-      , '\nDrug name:', drug
-      , '\n#####################################################################\n')
-
-paramD = {
-        'baseline_paramD': { 'input_df'        : X
-                            , 'target'         : y
-                            , 'var_type'       : 'mixed'
-                            , 'resampling_type': 'none'}
-        
-        , 'smnc_paramD': { 'input_df'          : X_smnc
-                          , 'target'           : y_smnc
-                          , 'var_type'         : 'mixed'
-                          , 'resampling_type'  : 'smnc'}
-    
-        , 'ros_paramD': { 'input_df'           : X_ros
-                        , 'target'             : y_ros
-                        , 'var_type'           : 'mixed'
-                        , 'resampling_type'    : 'ros'}
-
-        , 'rus_paramD' : { 'input_df'          : X_rus
-                          , 'target'           : y_rus
-                          , 'var_type'         : 'mixed'
-                          , 'resampling_type'  : 'rus'}
-
-        , 'rouC_paramD' : { 'input_df'         : X_rouC
-                            , 'target'          : y_rouC
-                            , 'var_type'        : 'mixed'
-                            , 'resampling_type' : 'rouC'}
-        }
-
-##==============================================================================
-## Dict with no CV BT formatted df
-## mmD = {}
-## for k, v in paramD.items():
-## #    print(mmD[k])
-##     scores_8020D = MultModelsCl(**paramD[k]
-##                         , tts_split_type = tts_split_8020
-##                         , skf_cv = skf_cv
-##                         , blind_test_df = X_bts
-##                         , blind_test_target = y_bts
-##                         , add_cm = True 
-##                         , add_yn = True
-##                         , return_formatted_output = False)
-##     mmD[k] = scores_8020D
-##==============================================================================
-## Initial run to get the dict of dicts for each sampling type containing CV, BT and metadata DFs 
-mmDD = {}
-for k, v in paramD.items():
-    scores_8020D = MultModelsCl(**paramD[k]
-                        , tts_split_type = tts_split_8020
-                        , skf_cv = skf_cv
-                        , blind_test_df = X_bts
-                        , blind_test_target = y_bts
-                        , add_cm = True 
-                        , add_yn = True
-                        , return_formatted_output = True)
-    mmDD[k] = scores_8020D
-
-# Extracting the dfs from within the dict and concatenating to output as one df
-for k, v in mmDD.items():
-    out_wf_8020 = pd.concat(mmDD, ignore_index = True)
-
-out_wf_8020f = out_wf_8020.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
-    
-print('\n######################################################################'
-      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
-      , '\nGene:', gene.lower()
-      , '\nDrug:', drug
-      , '\noutput file:', outFile_wf
-      , '\nDim of output:', out_wf_8020f.shape
-      , '\n######################################################################')
-###############################################################################
-#====================
-# Write output file
-#====================
-out_wf_8020f.to_csv(outFile_wf, index = False)
-print('\nFile successfully written:', outFile_wf)
-###############################################################################
--- a/scripts/ml/run_FS.py
+++ b/scripts/ml/run_FS.py
@ -1,255 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Tue May 24 08:11:05 2022
-
-@author: tanu
-"""
-#%%
-import os, sys
-import pandas as pd
-import numpy as np
-import pprint as pp
-from copy import deepcopy
-from sklearn import linear_model
-from sklearn import datasets
-from collections import Counter
-
-from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
-from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier
-
-from sklearn.naive_bayes import BernoulliNB
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.svm import SVC
-from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
-from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
-from sklearn.naive_bayes import GaussianNB
-from sklearn.gaussian_process import GaussianProcessClassifier, kernels
-from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel
-
-from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
-from sklearn.neural_network import MLPClassifier
-
-from sklearn.svm import SVC
-from xgboost import XGBClassifier
-from sklearn.naive_bayes import MultinomialNB
-from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
-
-from sklearn.compose import ColumnTransformer
-from sklearn.compose import make_column_transformer
-
-from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
-from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
-
-# added
-from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
-
-from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
-from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
-
-from sklearn.pipeline import Pipeline, make_pipeline
-
-from sklearn.feature_selection import RFE, RFECV
-
-import itertools
-import seaborn as sns
-import matplotlib.pyplot as plt
-
-from statistics import mean, stdev, median, mode
-
-from imblearn.over_sampling import RandomOverSampler
-from imblearn.under_sampling import RandomUnderSampler
-from imblearn.over_sampling import SMOTE
-from sklearn.datasets import make_classification
-from imblearn.combine import SMOTEENN
-from imblearn.combine import SMOTETomek
-
-from imblearn.over_sampling import SMOTENC
-from imblearn.under_sampling import EditedNearestNeighbours
-from imblearn.under_sampling import RepeatedEditedNearestNeighbours
-
-from sklearn.model_selection import GridSearchCV
-from sklearn.base import BaseEstimator
-from sklearn.impute import KNNImputer as KNN
-import json
-import argparse
-import re
-###############################################################################
-#gene  = 'pncA'
-#drug  = 'pyrazinamide'
-#total_mtblineage_uc = 8
-
-#%% command line args: case sensitive
-arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
-arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
-args = arg_parser.parse_args()
-
-drug    = args.drug
-gene    = args.gene
-
-###############################################################################
-#==================
-# other vars
-#==================
-tts_split    = '70_30'
-OutFile_suffix  = '7030_FS'
-###############################################################################
-#==================
-# Import data
-#==================
-from ml_data_7030 import *
-setvars(gene,drug)
-from ml_data_7030 import *
-
-# from YC run_all_ML: run locally
-#from UQ_yc_RunAllClfs import run_all_ML
-
-#==========================================
-# Import ML function: Feature selection
-#==========================================
-# TT run all ML clfs: feature selection
-from FS import fsgs
-
-#==================
-# Specify outdir 
-#==================
-outdir_ml = outdir + 'ml/tts_7030/fs/'
-print('\nOutput directory:', outdir_ml)
-#OutFileFS = outdir_ml + gene.lower() + '_FS' + OutFile_suffix + '.json'
-OutFileFS = outdir_ml + gene.lower() + '_FS_noOR' + OutFile_suffix + '.json'
-
-############################################################################
-
-###############################################################################
-#====================
-# single model CALL
-#====================
-# aFS = fsgs(input_df = X
-#          , target = y
-#          , param_gridLd = [{'fs__min_features_to_select': [1]}]
-#          , blind_test_df = X_bts
-#          , blind_test_target = y_bts
-#          , estimator = LogisticRegression(**rs)
-#          , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
-#          , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv =  skf_cv, scoring = 'matthews_corrcoef')
-#          , cv_method =  skf_cv
-#          , var_type = 'mixed'
-#          )
-#############
-# Loop
-############
-# models_all = [
-#           ('XGBoost'                   , XGBClassifier(**rs, **njobs
-#                                                        , n_estimators = 100 # wasn't there
-#                                                        , max_depyth = 3 # wasn't there
-#                                                        , verbosity = 3
-#                                                        #, use_label_encoder = False)
-#                                                        ) )
-# ]
-
-models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
-          ##, ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
-          , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
-          , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
-          , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
-          , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
-          ##, ('Gaussian NB'               , GaussianNB() )
-          ##, ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
-          ##, ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
-          , ('LDA'                       , LinearDiscriminantAnalysis() )
-          , ('Logistic Regression'       , LogisticRegression(**rs) )
-          , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
-          ##, ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
-          ##, ('Multinomial'               , MultinomialNB() )
-          ##, ('Naive Bayes'               , BernoulliNB() )
-          , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
-          ##, ('QDA'                       , QuadraticDiscriminantAnalysis() )
-          , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000 ) ) 
-          , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
-                                                                 , n_estimators     = 1000
-                                                                 , bootstrap        = True
-                                                                 , oob_score        = True
-                                                                 , **njobs
-                                                                 , **rs
-                                                                 , max_features     = 'auto') ) 
-          , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
-          , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
-          ##, ('SVC'                       , SVC(**rs) ) 
-          , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-          ## , ('XGBoost'                   , XGBClassifier(**rs, **njobs, verbosity = 3
-          ##                                                , use_label_encoder = False) )
-          ]
-
-print('\n#####################################################################'
-      , '\nRunning Feature Selection using classfication models (n):', len(models)
-      , '\nGene:'  , gene.lower()
-      , '\nDrug:'  , drug
-      , '\nSplit:' , tts_split
-      ,'\n####################################################################')
-
-for m in models:
-    print(m)
-print('\n====================================================================\n')
-
-out_fsD = {}
-index = 1
-for model_name, model_fn in models:
-    print('\nRunning classifier with FS:', index
-          , '\nModel_name:'               , model_name
-          , '\nModel func:'               , model_fn)
-          #, '\nList of models:', models)
-    index = index+1
-    
-    out_fsD[model_name] = fsgs(input_df = X
-              , target = y
-              , param_gridLd = [{'fs__min_features_to_select': [1]}]
-              , blind_test_df = X_bts
-              , blind_test_target = y_bts
-              , estimator = model_fn
-              , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
-              , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv =  skf_cv, scoring = 'matthews_corrcoef')
-              , cv_method =  skf_cv
-              , var_type = 'mixed'
-              )
-out_fsD
-#%% Checking results dict    
-tot_Ditems = sum(len(v) for v in out_fsD.values())
-
-checkL = []
-for k, v in out_fsD.items():
-    l = [len(out_fsD[k])]
-    checkL = checkL + l
-    n_sD = len(checkL) # no. of subDicts
-    l_sD = list(set(checkL)) # length of each subDict
-  
-print('\nTotal no.of subdicts:', n_sD)
-if len(l_sD) == 1 and tot_Ditems == n_sD*l_sD[0]:
-    print('\nPASS: successful run for all Classifiers'
-          , '\nLength of each subdict:', l_sD)
-
-print('\nSuccessfully ran Feature selection on', len(models), 'classifiers'
-      , '\nGene:', gene.lower()
-      , '\nDrug:', drug
-      , '\nSplit type:', tts_split
-      , '\nTotal fs models results:', len(out_fsD)
-      , '\nTotal items in output:', sum(len(v) for v in out_fsD.values()) )
-
-
-##############################################################################
-#%% json output
-#========================================
-# Write final output file
-# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
-#========================================
-# Output final dict as a json
-print('\nWriting Final output file (json):', OutFileFS)
-with open(OutFileFS, 'w') as f:
-    f.write(json.dumps(out_fsD
-#                       , cls = NpEncoder
-))
-    
-# read json
-with open(OutFileFS, 'r') as f:data = json.load(f)
-##############################################################################
-
--- a/scripts/ml/run_FS_7030.py
+++ b/scripts/ml/run_FS_7030.py
@ -1,242 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Tue May 24 08:11:05 2022
-
-@author: tanu
-"""
-#%%
-import os, sys
-import pandas as pd
-import numpy as np
-import pprint as pp
-from copy import deepcopy
-from sklearn import linear_model
-from sklearn import datasets
-from collections import Counter
-
-from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
-from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier
-
-from sklearn.naive_bayes import BernoulliNB
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.svm import SVC
-from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
-from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
-from sklearn.naive_bayes import GaussianNB
-from sklearn.gaussian_process import GaussianProcessClassifier, kernels
-from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel
-
-from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
-from sklearn.neural_network import MLPClassifier
-
-from sklearn.svm import SVC
-from xgboost import XGBClassifier
-from sklearn.naive_bayes import MultinomialNB
-from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
-
-from sklearn.compose import ColumnTransformer
-from sklearn.compose import make_column_transformer
-
-from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
-from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
-
-# added
-from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
-
-from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
-from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
-
-from sklearn.pipeline import Pipeline, make_pipeline
-
-from sklearn.feature_selection import RFE, RFECV
-
-import itertools
-import seaborn as sns
-import matplotlib.pyplot as plt
-
-from statistics import mean, stdev, median, mode
-
-from imblearn.over_sampling import RandomOverSampler
-from imblearn.under_sampling import RandomUnderSampler
-from imblearn.over_sampling import SMOTE
-from sklearn.datasets import make_classification
-from imblearn.combine import SMOTEENN
-from imblearn.combine import SMOTETomek
-
-from imblearn.over_sampling import SMOTENC
-from imblearn.under_sampling import EditedNearestNeighbours
-from imblearn.under_sampling import RepeatedEditedNearestNeighbours
-
-from sklearn.model_selection import GridSearchCV
-from sklearn.base import BaseEstimator
-from sklearn.impute import KNNImputer as KNN
-import json
-import argparse
-import re
-###############################################################################
-#gene  = 'pncA'
-#drug  = 'pyrazinamide'
-#total_mtblineage_uc = 8
-
-#%% command line args: case sensitive
-arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
-arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
-args = arg_parser.parse_args()
-
-drug    = args.drug
-gene    = args.gene
-
-###############################################################################
-#==================
-# other vars
-#==================
-tts_split    = '70_30'
-OutFile_suffix  = '7030_FS'
-###############################################################################
-homedir = os.path.expanduser("~")
-sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
-
-###############################################################################
-#==================
-# Import data
-#==================
-from ml_data_7030 import *
-setvars(gene,drug)
-from ml_data_7030 import *
-
-# from YC run_all_ML: run locally
-#from UQ_yc_RunAllClfs import run_all_ML
-
-#==========================================
-# Import ML functions:
-# fsgs_rfecv(): RFECV for Feature selection
-#==========================================
-from MultClfs import *
-
-#==================
-# Specify outdir 
-#==================
-outdir_ml = outdir + 'ml/tts_7030/fs/'
-print('\nOutput directory:', outdir_ml)
-#OutFileFS = outdir_ml + gene.lower() + '_FS' + OutFile_suffix + '.json'
-OutFileFS = outdir_ml + gene.lower() + '_FS_noOR' + OutFile_suffix + '.json'
-
-############################################################################
-
-###############################################################################
-#====================
-# single model CALL
-#====================
-# aFS = fsgs(input_df = X
-#          , target = y
-#          , param_gridLd = [{'fs__min_features_to_select': [1]}]
-#          , blind_test_df = X_bts
-#          , blind_test_target = y_bts
-#          , estimator = LogisticRegression(**rs)
-#          , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
-#          , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv =  skf_cv, scoring = 'matthews_corrcoef')
-#          , cv_method =  skf_cv
-#          , var_type = 'mixed'
-#          )
-#############
-# Loop
-############
-#models_fs = [('Decision Tree'             , DecisionTreeClassifier(**rs)) ]
-
-models_fs = [('AdaBoost Classifier'   , AdaBoostClassifier(**rs) )
-          , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
-          , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
-          , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
-          , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
-          , ('LDA'                       , LinearDiscriminantAnalysis() )
-          , ('Logistic Regression'       , LogisticRegression(**rs) )
-          , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
-          , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
-          , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000 ) ) 
-          , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
-                                                                 , n_estimators = 1000
-                                                                 , bootstrap    = True
-                                                                 , oob_score    = True
-                                                                 , **njobs
-                                                                 , **rs
-                                                                 , max_features = 'auto') ) 
-          , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
-          , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
-          , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-          ## , ('XGBoost'                   , XGBClassifier(**rs, **njobs, verbosity = 3 , use_label_encoder = False) )
-          ]
-
-print('\n#####################################################################'
-      , '\nRunning Feature Selection using classfication models_fs (n):', len(models_fs)
-      , '\nGene:'  , gene.lower()
-      , '\nDrug:'  , drug
-      , '\nSplit:' , tts_split
-      ,'\n####################################################################')
-
-for m in models_fs:
-    print(m)
-print('\n====================================================================\n')
-
-out_fsD = {}
-index = 1
-for model_name, model_fn in models_fs:
-    print('\nRunning classifier with FS:', index
-          , '\nModel_name:'               , model_name
-          , '\nModel func:'               , model_fn)
-          #, '\nList of models_fs:', models_fs)
-    index = index+1
-    
-    out_fsD[model_name] = fsgs_rfecv(input_df = X
-              , target = y
-              , param_gridLd = [{'fs__min_features_to_select': [1]}]
-              , blind_test_df = X_bts
-              , blind_test_target = y_bts
-              , estimator = model_fn
-              , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
-              , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv =  skf_cv, scoring = 'matthews_corrcoef')
-              , cv_method =  skf_cv
-              , var_type = 'mixed'
-              )
-out_fsD
-#%% Checking results dict    
-tot_Ditems = sum(len(v) for v in out_fsD.values())
-
-checkL = []
-for k, v in out_fsD.items():
-    l = [len(out_fsD[k])]
-    checkL = checkL + l
-    n_sD = len(checkL) # no. of subDicts
-    l_sD = list(set(checkL)) # length of each subDict
-  
-print('\nTotal no.of subdicts:', n_sD)
-if len(l_sD) == 1 and tot_Ditems == n_sD*l_sD[0]:
-    print('\nPASS: successful run for all Classifiers'
-          , '\nLength of each subdict:', l_sD)
-
-print('\nSuccessfully ran Feature selection on', len(models_fs), 'classifiers'
-      , '\nGene:', gene.lower()
-      , '\nDrug:', drug
-      , '\nSplit type:', tts_split
-      , '\nTotal fs models results:', len(out_fsD)
-      , '\nTotal items in output:', sum(len(v) for v in out_fsD.values()) )
-
-
-##############################################################################
-#%% json output
-#========================================
-# Write final output file
-# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
-#========================================
-# Output final dict as a json
-print('\nWriting Final output file (json):', OutFileFS)
-with open(OutFileFS, 'w') as f:
-    f.write(json.dumps(out_fsD
-#                       , cls = NpEncoder
-))
-    
-# read json
-with open(OutFileFS, 'r') as f:data = json.load(f)
-#############################################################################
-
--- a/scripts/ml/run_cd_7030.py
+++ b/scripts/ml/run_cd_7030.py
@ -1,142 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Jun 20 13:05:23 2022
-
-@author: tanu
-"""
-#%%Imports ####################################################################
-import re
-import argparse
-import os, sys
-
-# gene  = 'pncA'
-# drug  = 'pyrazinamide'
-#total_mtblineage_uc = 8
-###############################################################################
-#%% command line args: case sensitive
-arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
-arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
-args = arg_parser.parse_args()
-
-drug    = args.drug
-gene    = args.gene
-
-###############################################################################
-homedir = os.path.expanduser("~")
-sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
-
-###############################################################################
-#==================
-# Import data
-#==================
-from ml_data_cd_7030 import *
-setvars(gene,drug)
-from ml_data_cd_7030 import *
-
-# from YC run_all_ML: run locally
-#from UQ_yc_RunAllClfs import run_all_ML
-
-#====================
-# Import ML functions 
-#====================
-from MultClfs import *
-
-#==================
-# other vars
-#==================
-tts_split_cd_7030    = 'cd_7030'
-OutFile_suffix  = '_cd_7030'
-
-#==================
-# Specify outdir 
-#==================
-outdir_ml = outdir + 'ml/tts_cd_7030/'
-print('\nOutput directory:', outdir_ml)
-
-#outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
-outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'
-
-#%% Running models ############################################################
-print('\n#####################################################################\n'
-      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
-      , '\nGene name:', gene
-      , '\nDrug name:', drug
-      , '\n#####################################################################\n')
-
-paramD = {
-        'baseline_paramD': { 'input_df'        : X
-                            , 'target'         : y
-                            , 'var_type'       : 'mixed'
-                            , 'resampling_type': 'none'}
-        
-        , 'smnc_paramD': { 'input_df'          : X_smnc
-                          , 'target'           : y_smnc
-                          , 'var_type'         : 'mixed'
-                          , 'resampling_type'  : 'smnc'}
-    
-        , 'ros_paramD': { 'input_df'           : X_ros
-                        , 'target'             : y_ros
-                        , 'var_type'           : 'mixed'
-                        , 'resampling_type'    : 'ros'}
-
-        , 'rus_paramD' : { 'input_df'          : X_rus
-                          , 'target'           : y_rus
-                          , 'var_type'         : 'mixed'
-                          , 'resampling_type'  : 'rus'}
-
-        , 'rouC_paramD' : { 'input_df'         : X_rouC
-                            , 'target'          : y_rouC
-                            , 'var_type'        : 'mixed'
-                            , 'resampling_type' : 'rouC'}
-        }
-
-##==============================================================================
-## Disabled alternative: same run with return_formatted_output = False, i.e. a dict without the formatted CV/BT df
-## mmD = {}
-## for k, v in paramD.items():
-## #    print(mmD[k])
-##     scores_cd_7030D = MultModelsCl(**paramD[k]
-##                         , tts_split_type = tts_split_cd_7030
-##                         , skf_cv = skf_cv
-##                         , blind_test_df = X_bts
-##                         , blind_test_target = y_bts
-##                         , add_cm = True 
-##                         , add_yn = True
-##                         , return_formatted_output = False)
-##     mmD[k] = scores_cd_7030D
-##==============================================================================
-## Initial run to get the dict of dicts for each sampling type containing CV, BT and metadata DFs 
-mmDD = {}
-for k, v in paramD.items():
-    scores_cd_7030D = MultModelsCl(**paramD[k]
-                        , tts_split_type = tts_split_cd_7030
-                        , skf_cv = skf_cv
-                        , blind_test_df = X_bts
-                        , blind_test_target = y_bts
-                        , add_cm = True 
-                        , add_yn = True
-                        , return_formatted_output = True)
-    mmDD[k] = scores_cd_7030D
-
-# Extracting the dfs from within the dict and concatenating to output as one df
-for k, v in mmDD.items():
-    out_wf_cd_7030 = pd.concat(mmDD, ignore_index = True)
-
-out_wf_cd_7030f = out_wf_cd_7030.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
-    
-print('\n######################################################################'
-      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
-      , '\nGene:', gene.lower()
-      , '\nDrug:', drug
-      , '\noutput file:', outFile_wf
-      , '\nDim of output:', out_wf_cd_7030f.shape
-      , '\n######################################################################')
-###############################################################################
-#====================
-# Write output file
-#====================
-out_wf_cd_7030f.to_csv(outFile_wf, index = False)
-print('\nFile successfully written:', outFile_wf)
-###############################################################################
--- a/scripts/ml/run_cd_8020.py
+++ b/scripts/ml/run_cd_8020.py
@ -1,141 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Jun 20 13:05:23 2022
-
-@author: tanu
-"""
-#%%Imports ####################################################################
-import re
-import argparse
-import os, sys
-
-# gene  = 'pncA'
-# drug  = 'pyrazinamide'
-#total_mtblineage_uc = 8
-###############################################################################
-#%% command line args: case sensitive
-arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
-arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
-args = arg_parser.parse_args()
-
-drug    = args.drug
-gene    = args.gene
-
-###############################################################################
-homedir = os.path.expanduser("~")
-sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
-
-###############################################################################
-#==================
-# Import data
-#==================
-from ml_data_cd_8020 import *
-setvars(gene,drug)
-from ml_data_cd_8020 import *
-
-# from YC run_all_ML: run locally
-#from UQ_yc_RunAllClfs import run_all_ML
-
-#====================
-# Import ML functions 
-#====================
-from MultClfs import *
-
-#==================
-# other vars
-#==================
-tts_split_cd_8020    = 'cd_80_20'
-OutFile_suffix  = '_cd_8020'
-
-#==================
-# Specify outdir 
-#==================
-outdir_ml = outdir + 'ml/tts_cd_8020/'
-print('\nOutput directory:', outdir_ml)
-
-#outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
-outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'
-#%% Running models ############################################################
-print('\n#####################################################################\n'
-      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
-      , '\nGene name:', gene
-      , '\nDrug name:', drug
-      , '\n#####################################################################\n')
-
-paramD = {
-        'baseline_paramD': { 'input_df'        : X
-                            , 'target'         : y
-                            , 'var_type'       : 'mixed'
-                            , 'resampling_type': 'none'}
-        
-        , 'smnc_paramD': { 'input_df'          : X_smnc
-                          , 'target'           : y_smnc
-                          , 'var_type'         : 'mixed'
-                          , 'resampling_type'  : 'smnc'}
-    
-        , 'ros_paramD': { 'input_df'           : X_ros
-                        , 'target'             : y_ros
-                        , 'var_type'           : 'mixed'
-                        , 'resampling_type'    : 'ros'}
-
-        , 'rus_paramD' : { 'input_df'          : X_rus
-                          , 'target'           : y_rus
-                          , 'var_type'         : 'mixed'
-                          , 'resampling_type'  : 'rus'}
-
-        , 'rouC_paramD' : { 'input_df'         : X_rouC
-                            , 'target'          : y_rouC
-                            , 'var_type'        : 'mixed'
-                            , 'resampling_type' : 'rouC'}
-        }
-
-##==============================================================================
-## Disabled alternative: same run with return_formatted_output = False, i.e. a dict without the formatted CV/BT df
-## mmD = {}
-## for k, v in paramD.items():
-## #    print(mmD[k])
-##     scores_cd_8020D = MultModelsCl(**paramD[k]
-##                         , tts_split_type = tts_split_cd_8020
-##                         , skf_cv = skf_cv
-##                         , blind_test_df = X_bts
-##                         , blind_test_target = y_bts
-##                         , add_cm = True 
-##                         , add_yn = True
-##                         , return_formatted_output = False)
-##     mmD[k] = scores_cd_8020D
-##==============================================================================
-## Initial run to get the dict of dicts for each sampling type containing CV, BT and metadata DFs 
-mmDD = {}
-for k, v in paramD.items():
-    scores_cd_8020D = MultModelsCl(**paramD[k]
-                        , tts_split_type = tts_split_cd_8020
-                        , skf_cv = skf_cv
-                        , blind_test_df = X_bts
-                        , blind_test_target = y_bts
-                        , add_cm = True 
-                        , add_yn = True
-                        , return_formatted_output = True)
-    mmDD[k] = scores_cd_8020D
-
-# Extracting the dfs from within the dict and concatenating to output as one df
-for k, v in mmDD.items():
-    out_wf_cd_8020 = pd.concat(mmDD, ignore_index = True)
-
-out_wf_cd_8020f = out_wf_cd_8020.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
-    
-print('\n######################################################################'
-      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
-      , '\nGene:', gene.lower()
-      , '\nDrug:', drug
-      , '\noutput file:', outFile_wf
-      , '\nDim of output:', out_wf_cd_8020f.shape
-      , '\n######################################################################')
-###############################################################################
-#====================
-# Write output file
-#====================
-out_wf_cd_8020f.to_csv(outFile_wf, index = False)
-print('\nFile successfully written:', outFile_wf)
-###############################################################################
--- a/scripts/ml/run_cd_sl.py
+++ b/scripts/ml/run_cd_sl.py
@ -1,141 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Jun 20 13:05:23 2022
-
-@author: tanu
-"""
-#%%Imports ####################################################################
-import re
-import argparse
-import os, sys
-
-# gene  = 'pncA'
-# drug  = 'pyrazinamide'
-#total_mtblineage_uc = 8
-###############################################################################
-#%% command line args: case sensitive
-arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
-arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
-args = arg_parser.parse_args()
-
-drug    = args.drug
-gene    = args.gene
-
-###############################################################################
-homedir = os.path.expanduser("~")
-sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
-
-###############################################################################
-#==================
-# Import data
-#==================
-from ml_data_cd_sl import *
-setvars(gene,drug)
-from ml_data_cd_sl import *
-
-# from YC run_all_ML: run locally
-#from UQ_yc_RunAllClfs import run_all_ML
-
-#====================
-# Import ML functions 
-#====================
-from MultClfs import *
-
-#==================
-# other vars
-#==================
-tts_split_cd_sl    = 'cd_sl'
-OutFile_suffix  = '_cd_sl'
-
-#==================
-# Specify outdir 
-#==================
-outdir_ml = outdir + 'ml/tts_cd_sl/'
-print('\nOutput directory:', outdir_ml)
-
-#outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
-outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'
-#%% Running models ############################################################
-print('\n#####################################################################\n'
-      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
-      , '\nGene name:', gene
-      , '\nDrug name:', drug
-      , '\n#####################################################################\n')
-
-paramD = {
-        'baseline_paramD': { 'input_df'        : X
-                            , 'target'         : y
-                            , 'var_type'       : 'mixed'
-                            , 'resampling_type': 'none'}
-        
-        , 'smnc_paramD': { 'input_df'          : X_smnc
-                          , 'target'           : y_smnc
-                          , 'var_type'         : 'mixed'
-                          , 'resampling_type'  : 'smnc'}
-    
-        , 'ros_paramD': { 'input_df'           : X_ros
-                        , 'target'             : y_ros
-                        , 'var_type'           : 'mixed'
-                        , 'resampling_type'    : 'ros'}
-
-        , 'rus_paramD' : { 'input_df'          : X_rus
-                          , 'target'           : y_rus
-                          , 'var_type'         : 'mixed'
-                          , 'resampling_type'  : 'rus'}
-
-        , 'rouC_paramD' : { 'input_df'         : X_rouC
-                            , 'target'          : y_rouC
-                            , 'var_type'        : 'mixed'
-                            , 'resampling_type' : 'rouC'}
-        }
-
-##==============================================================================
-## Disabled alternative: same run with return_formatted_output = False, i.e. a dict without the formatted CV/BT df
-## mmD = {}
-## for k, v in paramD.items():
-## #    print(mmD[k])
-##     scores_cd_slD = MultModelsCl(**paramD[k]
-##                         , tts_split_type = tts_split_cd_sl
-##                         , skf_cv = skf_cv
-##                         , blind_test_df = X_bts
-##                         , blind_test_target = y_bts
-##                         , add_cm = True 
-##                         , add_yn = True
-##                         , return_formatted_output = False)
-##     mmD[k] = scores_cd_slD
-##==============================================================================
-## Initial run to get the dict of dicts for each sampling type containing CV, BT and metadata DFs 
-mmDD = {}
-for k, v in paramD.items():
-    scores_cd_slD = MultModelsCl(**paramD[k]
-                        , tts_split_type = tts_split_cd_sl
-                        , skf_cv = skf_cv
-                        , blind_test_df = X_bts
-                        , blind_test_target = y_bts
-                        , add_cm = True 
-                        , add_yn = True
-                        , return_formatted_output = True)
-    mmDD[k] = scores_cd_slD
-
-# Extracting the dfs from within the dict and concatenating to output as one df
-for k, v in mmDD.items():
-    out_wf_cd_sl = pd.concat(mmDD, ignore_index = True)
-
-out_wf_cd_slf = out_wf_cd_sl.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
-    
-print('\n######################################################################'
-      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
-      , '\nGene:', gene.lower()
-      , '\nDrug:', drug
-      , '\noutput file:', outFile_wf
-      , '\nDim of output:', out_wf_cd_slf.shape
-      , '\n######################################################################')
-###############################################################################
-#====================
-# Write output file
-#====================
-out_wf_cd_slf.to_csv(outFile_wf, index = False)
-print('\nFile successfully written:', outFile_wf)
-###############################################################################
--- a/scripts/ml/run_fg.py
+++ b/scripts/ml/run_fg.py
@ -1,557 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Sat May 28 05:25:30 2022
-
-@author: tanu
-"""
-
-import os
-import re
-import argparse
-
-###############################################################################
-# gene  = 'pncA'
-# drug  = 'pyrazinamide'
-#total_mtblineage_uc = 8
-
-#%% command line args: case sensitive
-arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
-arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
-args = arg_parser.parse_args()
-
-drug    = args.drug
-gene    = args.gene
-###############################################################################
-homedir = os.path.expanduser("~")
-os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
-
-#==================
-# Import data
-#==================
-from ml_data_fg import *
-setvars(gene,drug)
-from ml_data_fg import *
-
-# from YC run_all_ML: run locally
-#from UQ_yc_RunAllClfs import run_all_ML
-
-#====================
-# Import ML function 
-#====================
-# TT run all ML clfs: baseline model
-from MultModelsCl import MultModelsCl
-
-############################################################################
-print('\n#####################################################################\n'
-      , '\nRunning ML analysis: feature groups '
-      , '\nGene name:', gene
-      , '\nDrug name:', drug)
-
-#==================
-# Specify outdir 
-#==================
-outdir_ml = outdir + 'ml/uq_v1/fgs/'
-print('\nOutput directory:', outdir_ml)
-outFile = outdir_ml + gene.lower() + '_baseline_FG.csv'
-
-#==================
-# other vars
-#==================
-tts_split  = 'original'
-resampling      = 'none'
-
-###############################################################################
-score_type_ordermapD = { 'mcc'      : 1
-                   , 'fscore'       : 2
-                   , 'jcc'          : 3
-                   , 'precision'    : 4
-                   , 'recall'       : 5      
-                   , 'accuracy'     : 6  
-                   , 'roc_auc'      : 7
-                   , 'TN'           : 8
-                   , 'FP'           : 9
-                   , 'FN'           : 10
-                   , 'TP'           : 11  
-                   , 'trainingY_neg': 12  
-                   , 'trainingY_pos': 13    
-                   , 'blindY_neg'   : 14
-                   , 'blindY_pos'   : 15
-                   , 'fit_time'     : 16
-                   , 'score_time'   : 17
-                   }
-#%%###########################################################################
-print('\n================================================================\n')
-
-#all_featuresN   = X_evolFN + X_structural_FN + X_genomicFN
-#    X_structural_FN =  X_stability_FN + X_affinityFN + X_resprop_FN
-#    X_resprop_FN    = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
-     
-print('\n================================================================'
-      
-      , '\nTotal Evolutionary features (n):' , len(X_evolFN)
-      , '\n--------------Evol. feature colnames:', X_evolFN
-      
-      , '\n================================================================'
-      
-      , '\n\nTotal structural features (n):', len(X_structural_FN)
-      
-      , '\n--------Stability ncols:'                      , len(X_stability_FN)
-      , '\n--------------Common stability colnames:'      , X_common_stability_Fnum
-      , '\n--------------Foldx colnames:'                 , X_foldX_Fnum
-     
-      , '\n--------Affinity ncols:'                       , len(X_affinityFN)
-      , '\n--------------Common affinity colnames:'       , common_affinity_Fnum
-      , '\n--------------Gene specific affinity colnames:', gene_affinity_colnames
-
-      , '\n--------Residue prop ncols:'                   , len(X_resprop_FN)
-      , '\n--------------Residue Prop cols:'              , X_str_Fnum
-      , '\n--------------AA change Prop cols:'            , X_aap_Fcat
-      , '\n--------------AA index cols:'                  , X_aaindex_Fnum
-      
-      , '\n================================================================'
-      
-      , '\n\nTotal Genomic features (n):'   , len(X_genomicFN)
-      , '\n--------MAF+OR cols:'                         , len(X_gn_mafor_Fnum)
-      , '\n--------------MAF+OR colnames:'               , X_gn_mafor_Fnum
-
-      , '\n--------Lineage cols:'                        , len(X_gn_linegae_Fnum)
-      , '\n--------------Lineage cols:'                  , X_gn_linegae_Fnum
-
-      , '\n--------Other cols:'                          , len(X_gn_Fcat)
-      , '\n--------------Other cols:'                    , X_gn_Fcat
-      
-      , '\n================================================================')
-
-# Sanity check
-if ( len(X.columns) ==  len(X_evolFN) + len(X_structural_FN) + len(X_genomicFN)):
-    print('\nPass: No. of features match')
-else:
-    print('\nFail: Count of feature mismatch'
-          , '\nExpected:', len(X_evolFN) + len(X_structural_FN) + len(X_genomicFN)
-          , '\nGot:', len(X.columns))
-    sys.exit()
-
-print('\n#####################################################################\n')
-###############################################################################
-#================
-# Evolutionary
-# X_evolFN
-#================  
-feature_gp_nameEV      = 'evolutionary'
-n_featuresEV           = len(X_evolFN)
-
-scores_mmEV = MultModelsCl(input_df = X[X_evolFN]
-                    , target = y
-                    , var_type = 'mixed'
-                    , skf_cv = skf_cv
-                    , blind_test_input_df = X_bts[X_evolFN]
-                    , blind_test_target = y_bts
-                    , add_cm = True 
-                    , add_yn = True)
-
-baseline_allEV = pd.DataFrame(scores_mmEV)
-
-baseline_EV = baseline_allEV.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
-baseline_EV = baseline_EV.reset_index()
-baseline_EV.rename(columns = {'index': 'original_names'}, inplace = True)
-
-# Indicate whether each score row is BT ('bts_' prefix) or CV
-bt_pattern = re.compile(r'bts_.*')
-baseline_EV['data_source'] = baseline_EV.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
-
-baseline_EV['score_type'] = baseline_EV['original_names'].str.replace('bts_|test_', '', regex = True)
-
-score_type_uniqueN = set(baseline_EV['score_type'])
-cL1 = list(score_type_ordermapD.keys())
-cL2 = list(score_type_uniqueN)
-
-if set(cL1).issubset(cL2):
-    print('\nPASS: sorting df by score that is mapped onto the order I want')
-    baseline_EV['score_order'] = baseline_EV['score_type'].map(score_type_ordermapD)
-    baseline_EV.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
-else:
-    sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
-    
-baseline_EV['feature_group'] = feature_gp_nameEV
-baseline_EV['resampling']    = resampling
-baseline_EV['tts_split']     = tts_split
-baseline_EV['n_features']    = n_featuresEV
-###############################################################################
-#================
-# Genomics
-# X_genomicFN
-#================
-feature_gp_nameGN      = 'genomics'
-n_featuresGN           = len(X_genomicFN)
-
-scores_mmGN = MultModelsCl(input_df = X[X_genomicFN]
-                    , target = y
-                    , var_type = 'mixed'
-                    , skf_cv = skf_cv
-                    , blind_test_input_df = X_bts[X_genomicFN]
-                    , blind_test_target = y_bts
-                    , add_cm = True 
-                    , add_yn = True)
-
-baseline_allGN = pd.DataFrame(scores_mmGN)
-
-baseline_GN = baseline_allGN.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
-baseline_GN = baseline_GN.reset_index()
-baseline_GN.rename(columns = {'index': 'original_names'}, inplace = True)
-
-# Indicate whether each score row is BT ('bts_' prefix) or CV
-bt_pattern = re.compile(r'bts_.*')
-baseline_GN['data_source'] = baseline_GN.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
-
-baseline_GN['score_type'] = baseline_GN['original_names'].str.replace('bts_|test_', '', regex = True)
-
-score_type_uniqueN = set(baseline_GN['score_type'])
-cL1 = list(score_type_ordermapD.keys())
-cL2 = list(score_type_uniqueN)
-
-if set(cL1).issubset(cL2):
-    print('\nPASS: sorting df by score that is mapped onto the order I want')
-    baseline_GN['score_order'] = baseline_GN['score_type'].map(score_type_ordermapD)
-    baseline_GN.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
-else:
-    sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
-    
-baseline_GN['feature_group'] = feature_gp_nameGN
-baseline_GN['resampling'] = resampling
-baseline_GN['tts_split']     = tts_split
-baseline_GN['n_features']    = n_featuresGN
-###############################################################################
-#all_featuresN   = X_evolFN + X_structural_FN + X_genomicFN
-#    X_structural_FN =  X_stability_FN + X_affinityFN + X_resprop_FN
-#    X_resprop_FN    = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
-#================
-# Structural cols
-# X_structural_FN
-#================
-feature_gp_nameSTR      = 'structural'
-n_featuresSTR           = len(X_structural_FN)
-
-scores_mmSTR = MultModelsCl(input_df = X[X_structural_FN]
-                    , target = y
-                    , var_type = 'mixed'
-                    , skf_cv = skf_cv
-                    , blind_test_input_df = X_bts[X_structural_FN]
-                    , blind_test_target = y_bts
-                    , add_cm = True 
-                    , add_yn = True)
-
-baseline_allSTR = pd.DataFrame(scores_mmSTR)
-
-baseline_STR = baseline_allSTR.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
-baseline_STR = baseline_STR.reset_index()
-baseline_STR.rename(columns = {'index': 'original_names'}, inplace = True)
-
-# Indicate whether BT or CT
-bt_pattern = re.compile(r'bts_.*')
-baseline_STR['data_source'] = baseline_STR.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
-
-baseline_STR['score_type'] = baseline_STR['original_names'].str.replace('bts_|test_', '', regex = True)
-
-score_type_uniqueN = set(baseline_STR['score_type'])
-cL1 = list(score_type_ordermapD.keys())
-cL2 = list(score_type_uniqueN)
-
-if set(cL1).issubset(cL2):
-    print('\nPASS: sorting df by score that is mapped onto the order I want')
-    baseline_STR['score_order'] = baseline_STR['score_type'].map(score_type_ordermapD)
-    baseline_STR.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
-else:
-    sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
-    
-baseline_STR['feature_group'] = feature_gp_nameSTR
-baseline_STR['resampling'] = resampling
-baseline_STR['tts_split']     = tts_split
-baseline_STR['n_features']    = n_featuresSTR
-##############################################################################
-#================
-# Stability cols
-# X_stability_FN
-#================  
-feature_gp_nameSTB      = 'stability'
-n_featuresSTB           = len(X_stability_FN)
-
-scores_mmSTB = MultModelsCl(input_df = X[X_stability_FN]
-                    , target = y
-                    , var_type = 'mixed'
-                    , skf_cv = skf_cv
-                    , blind_test_input_df = X_bts[X_stability_FN]
-                    , blind_test_target = y_bts
-                    , add_cm = True 
-                    , add_yn = True)
-
-baseline_allSTB = pd.DataFrame(scores_mmSTB)
-
-baseline_STB = baseline_allSTB.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
-baseline_STB = baseline_STB.reset_index()
-baseline_STB.rename(columns = {'index': 'original_names'}, inplace = True)
-
-# Indicate whether BT or CT
-bt_pattern = re.compile(r'bts_.*')
-baseline_STB['data_source'] = baseline_STB.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
-
-baseline_STB['score_type'] = baseline_STB['original_names'].str.replace('bts_|test_', '', regex = True)
-
-score_type_uniqueN = set(baseline_STB['score_type'])
-cL1 = list(score_type_ordermapD.keys())
-cL2 = list(score_type_uniqueN)
-
-if set(cL1).issubset(cL2):
-    print('\nPASS: sorting df by score that is mapped onto the order I want')
-    baseline_STB['score_order'] = baseline_STB['score_type'].map(score_type_ordermapD)
-    baseline_STB.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
-else:
-    sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
-    
-baseline_STB['feature_group'] = feature_gp_nameSTB
-baseline_STB['resampling'] = resampling
-baseline_STB['tts_split']     = tts_split
-baseline_STB['n_features']    = n_featuresSTB
-###############################################################################
-#================
-# Affinity cols
-# X_affinityFN
-#================
-feature_gp_nameAFF      = 'affinity'
-n_featuresAFF           = len(X_affinityFN)
-
-scores_mmAFF = MultModelsCl(input_df = X[X_affinityFN]
-                    , target = y
-                    , var_type = 'mixed'
-                    , skf_cv = skf_cv
-                    , blind_test_input_df = X_bts[X_affinityFN]
-                    , blind_test_target = y_bts
-                    , add_cm = True 
-                    , add_yn = True)
-
-baseline_allAFF = pd.DataFrame(scores_mmAFF)
-
-baseline_AFF = baseline_allAFF.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
-baseline_AFF = baseline_AFF.reset_index()
-baseline_AFF.rename(columns = {'index': 'original_names'}, inplace = True)
-
-# Indicate whether BT or CT
-bt_pattern = re.compile(r'bts_.*')
-baseline_AFF['data_source'] = baseline_AFF.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
-
-baseline_AFF['score_type'] = baseline_AFF['original_names'].str.replace('bts_|test_', '', regex = True)
-
-score_type_uniqueN = set(baseline_AFF['score_type'])
-cL1 = list(score_type_ordermapD.keys())
-cL2 = list(score_type_uniqueN)
-
-if set(cL1).issubset(cL2):
-    print('\nPASS: sorting df by score that is mapped onto the order I want')
-    baseline_AFF['score_order'] = baseline_AFF['score_type'].map(score_type_ordermapD)
-    baseline_AFF.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
-else:
-    sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
-    
-baseline_AFF['feature_group'] = feature_gp_nameAFF
-baseline_AFF['resampling'] = resampling
-baseline_AFF['tts_split']     = tts_split
-baseline_AFF['n_features']    = n_featuresAFF
-###############################################################################
-#================
-# Residue level
-# X_resprop_FN
-#================
-feature_gp_nameRES      = 'residue_prop'
-n_featuresRES           = len(X_resprop_FN)
-
-scores_mmRES = MultModelsCl(input_df = X[X_resprop_FN]
-                    , target = y
-                    , var_type = 'mixed'
-                    , skf_cv = skf_cv
-                    , blind_test_input_df = X_bts[X_resprop_FN]
-                    , blind_test_target = y_bts
-                    , add_cm = True 
-                    , add_yn = True)
-
-baseline_allRES = pd.DataFrame(scores_mmRES)
-
-baseline_RES = baseline_allRES.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
-baseline_RES = baseline_RES.reset_index()
-baseline_RES.rename(columns = {'index': 'original_names'}, inplace = True)
-
-# Indicate whether BT or CT
-bt_pattern = re.compile(r'bts_.*')
-baseline_RES['data_source'] = baseline_RES.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
-
-baseline_RES['score_type'] = baseline_RES['original_names'].str.replace('bts_|test_', '', regex = True)
-
-score_type_uniqueN = set(baseline_RES['score_type'])
-cL1 = list(score_type_ordermapD.keys())
-cL2 = list(score_type_uniqueN)
-
-if set(cL1).issubset(cL2):
-    print('\nPASS: sorting df by score that is mapped onto the order I want')
-    baseline_RES['score_order'] = baseline_RES['score_type'].map(score_type_ordermapD)
-    baseline_RES.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
-else:
-    sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
-    
-baseline_RES['feature_group'] = feature_gp_nameRES
-baseline_RES['resampling'] = resampling
-baseline_RES['tts_split']     = tts_split
-baseline_RES['n_features']    = n_featuresRES
-###############################################################################
-#================
-# Residue level-AAindex
-#X_resprop_FN - X_aaindex_Fnum
-#================
-X_respropNOaaFN = list(set(X_resprop_FN) - set(X_aaindex_Fnum))
-  
-feature_gp_nameRNAA      = 'ResPropNoAA'
-n_featuresRNAA           = len(X_respropNOaaFN)
-
-scores_mmRNAA = MultModelsCl(input_df = X[X_respropNOaaFN]
-                    , target = y
-                    , var_type = 'mixed'
-                    , skf_cv = skf_cv
-                    , blind_test_input_df = X_bts[X_respropNOaaFN]
-                    , blind_test_target = y_bts
-                    , add_cm = True 
-                    , add_yn = True)
-
-baseline_allRNAA = pd.DataFrame(scores_mmRNAA)
-
-baseline_RNAA = baseline_allRNAA.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
-baseline_RNAA = baseline_RNAA.reset_index()
-baseline_RNAA.rename(columns = {'index': 'original_names'}, inplace = True)
-
-# Indicate whether BT or CT
-bt_pattern = re.compile(r'bts_.*')
-baseline_RNAA['data_source'] = baseline_RNAA.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
-
-baseline_RNAA['score_type'] = baseline_RNAA['original_names'].str.replace('bts_|test_', '', regex = True)
-
-score_type_uniqueN = set(baseline_RNAA['score_type'])
-cL1 = list(score_type_ordermapD.keys())
-cL2 = list(score_type_uniqueN)
-
-if set(cL1).issubset(cL2):
-    print('\nPASS: sorting df by score that is mapped onto the order I want')
-    baseline_RNAA['score_order'] = baseline_RNAA['score_type'].map(score_type_ordermapD)
-    baseline_RNAA.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
-else:
-    sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
-    
-baseline_RNAA['feature_group'] = feature_gp_nameRNAA
-baseline_RNAA['resampling'] = resampling
-baseline_RNAA['tts_split']     = tts_split
-baseline_RNAA['n_features']    = n_featuresRNAA
-###############################################################################
-#================
-# Structural cols-AAindex
-#X_structural_FN - X_aaindex_Fnum
-#================
-X_strNOaaFN = list(set(X_structural_FN) - set(X_aaindex_Fnum))
-
-feature_gp_nameSNAA      = 'StrNoAA'
-n_featuresSNAA           = len(X_strNOaaFN)
-
-scores_mmSNAA = MultModelsCl(input_df = X[X_strNOaaFN]
-                    , target = y
-                    , var_type = 'mixed'
-                    , skf_cv = skf_cv
-                    , blind_test_input_df = X_bts[X_strNOaaFN]
-                    , blind_test_target = y_bts
-                    , add_cm = True 
-                    , add_yn = True)
-
-baseline_allSNAA = pd.DataFrame(scores_mmSNAA)
-
-baseline_SNAA = baseline_allSNAA.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
-baseline_SNAA = baseline_SNAA.reset_index()
-baseline_SNAA.rename(columns = {'index': 'original_names'}, inplace = True)
-
-# Indicate whether BT or CT
-bt_pattern = re.compile(r'bts_.*')
-baseline_SNAA['data_source'] = baseline_SNAA.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
-
-baseline_SNAA['score_type'] = baseline_SNAA['original_names'].str.replace('bts_|test_', '', regex = True)
-
-score_type_uniqueN = set(baseline_SNAA['score_type'])
-cL1 = list(score_type_ordermapD.keys())
-cL2 = list(score_type_uniqueN)
-
-if set(cL1).issubset(cL2):
-    print('\nPASS: sorting df by score that is mapped onto the order I want')
-    baseline_SNAA['score_order'] = baseline_SNAA['score_type'].map(score_type_ordermapD)
-    baseline_SNAA.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
-else:
-    sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
-    
-baseline_SNAA['feature_group'] = feature_gp_nameSNAA
-baseline_SNAA['resampling']    = resampling
-baseline_SNAA['tts_split']     = tts_split
-baseline_SNAA['n_features']    = n_featuresSNAA
-###############################################################################
-#%% COMBINING all FG dfs
-#================
-# Combine all
-# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
-#================
-dfs_combine = [baseline_EV, baseline_GN, baseline_STR, baseline_STB, baseline_AFF, baseline_RES , baseline_RNAA , baseline_SNAA]
-              
-dfs_nrows = []
-for df in dfs_combine:
-    dfs_nrows = dfs_nrows + [len(df)]
-dfs_nrows = max(dfs_nrows)
-    
-dfs_ncols = []
-for df in dfs_combine:
-    dfs_ncols = dfs_ncols + [len(df.columns)]
-dfs_ncols = max(dfs_ncols)
-           
-# dfs_ncols = []
-# dfs_ncols2 = mode(dfs_ncols.append(len(df.columns) for df in dfs_combine)
-# dfs_ncols2
-
-expected_nrows = len(dfs_combine) * dfs_nrows
-expected_ncols = dfs_ncols
-
-common_cols = list(set.intersection(*(set(df.columns) for df in dfs_combine)))
-
-if len(common_cols) == dfs_ncols :
-    combined_FG_baseline = pd.concat([df[common_cols] for df in dfs_combine], ignore_index=True)
-    fgs = combined_FG_baseline[['feature_group', 'n_features']]
-    fgs = fgs.drop_duplicates()
-    print('\nConcatenating dfs with feature groups after ML analysis:' 
-          , '\nNo. of dfs combining:', len(dfs_combine)
-          , '\nSampling type:', resampling
-          , '\nThe feature groups are:'
-          , '\n', fgs)
-    if len(combined_FG_baseline) == expected_nrows  and len(combined_FG_baseline.columns) == expected_ncols:
-        print('\nPASS:', len(dfs_combine), 'dfs successfully combined'
-              , '\nnrows in combined_df:', len(combined_FG_baseline)
-              , '\nncols in combined_df:', len(combined_FG_baseline.columns))
-    else:
-        print('\nFAIL: concatenating failed'
-              , '\nExpected nrows:', expected_nrows
-              , '\nGot:', len(combined_FG_baseline)
-              , '\nExpected ncols:', expected_ncols
-              , '\nGot:', len(combined_FG_baseline.columns))
-        sys.exit()
-else:
-    sys.exit('\nConcatenting dfs not possible,check numbers ')
-        
-# # rpow bind 
-# if all(ll((baseline_EV.columns == baseline_GN.columns == baseline_STR.columns)):
-#     print('\nPASS:colnames match, proceeding to rowbind')
-#     comb_df = pd.concat()], axis = 0, ignore_index = True ) 
-###############################################################################
-#====================
-# Write output file
-#====================
-    
-combined_FG_baseline.to_csv(outFile, index = False)
-print('\nFile successfully written:', outFile)
-###############################################################################
--- a/scripts/ml/run_sl.py
+++ b/scripts/ml/run_sl.py
@ -1,141 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Jun 20 13:05:23 2022
-
-@author: tanu
-"""
-#%%Imports ####################################################################
-import re
-import argparse
-import os, sys
-
-# gene  = 'pncA'
-# drug  = 'pyrazinamide'
-#total_mtblineage_uc = 8
-###############################################################################
-#%% command line args: case sensitive
-arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
-arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
-args = arg_parser.parse_args()
-
-drug    = args.drug
-gene    = args.gene
-
-###############################################################################
-homedir = os.path.expanduser("~")
-sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
-
-###############################################################################
-#==================
-# Import data
-#==================
-from ml_data_sl import *
-setvars(gene,drug)
-from ml_data_sl import *
-
-# from YC run_all_ML: run locally
-#from UQ_yc_RunAllClfs import run_all_ML
-
-#====================
-# Import ML functions 
-#====================
-from MultClfs import *
-
-#==================
-# other vars
-#==================
-tts_split_sl    = 'sl'
-OutFile_suffix  = 'sl'
-
-#==================
-# Specify outdir 
-#==================
-outdir_ml = outdir + 'ml/tts_sl/'
-print('\nOutput directory:', outdir_ml)
-
-#outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
-outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'
-#%% Running models ############################################################
-print('\n#####################################################################\n'
-      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
-      , '\nGene name:', gene
-      , '\nDrug name:', drug
-      , '\n#####################################################################\n')
-
-paramD = {
-        'baseline_paramD': { 'input_df'        : X
-                            , 'target'         : y
-                            , 'var_type'       : 'mixed'
-                            , 'resampling_type': 'none'}
-        
-        , 'smnc_paramD': { 'input_df'          : X_smnc
-                          , 'target'           : y_smnc
-                          , 'var_type'         : 'mixed'
-                          , 'resampling_type'  : 'smnc'}
-    
-        , 'ros_paramD': { 'input_df'           : X_ros
-                        , 'target'             : y_ros
-                        , 'var_type'           : 'mixed'
-                        , 'resampling_type'    : 'ros'}
-
-        , 'rus_paramD' : { 'input_df'          : X_rus
-                          , 'target'           : y_rus
-                          , 'var_type'         : 'mixed'
-                          , 'resampling_type'  : 'rus'}
-
-        , 'rouC_paramD' : { 'input_df'         : X_rouC
-                            , 'target'          : y_rouC
-                            , 'var_type'        : 'mixed'
-                            , 'resampling_type' : 'rouC'}
-        }
-
-##==============================================================================
-## Dict with no CV BT formatted df
-## mmD = {}
-## for k, v in paramD.items():
-## #    print(mmD[k])
-##     scores_slD = MultModelsCl(**paramD[k]
-##                         , tts_split_type = tts_split_sl
-##                         , skf_cv = skf_cv
-##                         , blind_test_df = X_bts
-##                         , blind_test_target = y_bts
-##                         , add_cm = True 
-##                         , add_yn = True
-##                         , return_formatted_output = False)
-##     mmD[k] = scores_slD
-##==============================================================================
-## Initial run to get the dict of dicts for each sampling type containing CV, BT and metadata DFs 
-mmDD = {}
-for k, v in paramD.items():
-    scores_slD = MultModelsCl(**paramD[k]
-                        , tts_split_type = tts_split_sl
-                        , skf_cv = skf_cv
-                        , blind_test_df = X_bts
-                        , blind_test_target = y_bts
-                        , add_cm = True 
-                        , add_yn = True
-                        , return_formatted_output = True)
-    mmDD[k] = scores_slD
-
-# Extracting the dfs from within the dict and concatenating to output as one df
-for k, v in mmDD.items():
-    out_wf_sl = pd.concat(mmDD, ignore_index = True)
-
-out_wf_slf = out_wf_sl.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
-    
-print('\n######################################################################'
-      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
-      , '\nGene:', gene.lower()
-      , '\nDrug:', drug
-      , '\noutput file:', outFile_wf
-      , '\nDim of output:', out_wf_slf.shape
-      , '\n######################################################################')
-###############################################################################
-#====================
-# Write output file
-#====================
-out_wf_slf.to_csv(outFile_wf, index = False)
-print('\nFile successfully written:', outFile_wf)
-###############################################################################
--- a/scripts/ml/running_ml_scripts.txt
+++ b/scripts/ml/running_ml_scripts.txt
@ -1,158 +1,83 @@
 ########################################################################

-                                  #70/30
+#                          70/30 [WITHOUT OR]

 ########################################################################
-=-----------------------------------=
-# All features including AA index
-#                     [WITH OR]
-=-----------------------------------=
-time ./run_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_7030.txt #d
-time ./run_7030.py -g embB -d ethambutol 2>&1 | tee log_embb_7030.txt
-time ./run_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_7030.txt
-time ./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030.txt
-time ./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030.txt
-time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt
-
-# alr: # ERROR, as expected, too few values!
-# gid: problems

 =-----------------------------------=
-# All features including AA index
-#                      [WITHOUT OR] **DONE
+# actual data
 #------------------------------------=
-time ./run_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_7030_noOR.txt
-time ./run_7030.py -g embB -d ethambutol 2>&1 | tee log_embb_7030_noOR.txt
-time ./run_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_7030_noOR.txt
-time ./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030_noOR.txt
-time ./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030_noOR.txt
-time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030_noOR.txt
-########################################################################

-                                   # 80/20
-
-########################################################################
+time ./run_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_7030_.txt
+time ./run_7030.py -g embB -d ethambutol 2>&1 | tee log_embb_7030_.txt
+time ./run_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_7030_.txt
+time ./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030_.txt
+time ./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030_.txt
+time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030_.txt

 =-----------------------------------=
-# All features including AA index
-#                     [WITH OR]
-=-----------------------------------=
-time ./run_8020.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_8020.txt
-time ./run_8020.py -g embB -d ethambutol 2>&1 | tee log_embb_8020.txt
-time ./run_8020.py -g katG -d isoniazid 2>&1 | tee log_katg_8020.txt
-time ./run_8020.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_8020.txt
-time ./run_8020.py -g gid -d streptomycin 2>&1 | tee log_gid_8020.txt
-time ./run_8020.py -g alr -d cycloserine 2>&1 | tee log_alr_8020.txt
-
-
-=-----------------------------------=
-# All features including AA index
-#                      [WITHOUT OR] **DONE
-real	0m1.099s
-user	0m1.308s
-sys	0m1.474s
-=-----------------------------------=
-time ./run_8020.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_8020_noOR.txt
-time ./run_8020.py -g embB -d ethambutol 2>&1 | tee log_embb_8020_noOR.txt
-time ./run_8020.py -g katG -d isoniazid 2>&1 | tee log_katg_8020_noOR.txt
-time ./run_8020.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_8020_noOR.txt
-time ./run_8020.py -g gid -d streptomycin 2>&1 | tee log_gid_8020_noOR.txt
-time ./run_8020.py -g alr -d cycloserine 2>&1 | tee log_alr_8020_noOR.txt
-
-########################################################################
-
-                                   # SL
-
-########################################################################
-
-=-----------------------------------=
-# All features including AA index
-=-----------------------------------=
-
-
-
-
-
-
-=-----------------------------------=
-# All features including AA index
-#                      [WITHOUT OR]
-=-----------------------------------=
-time ./run_sl.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_sl_noOR.txt
-time ./run_sl.py -g embB -d ethambutol 2>&1 | tee log_embb_sl_noOR.txt
-time ./run_sl.py -g katG -d isoniazid 2>&1 | tee log_katg_sl_noOR.txt
-time ./run_sl.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_sl_noOR.txt
-time ./run_sl.py -g gid -d streptomycin 2>&1 | tee log_gid_sl_noOR.txt
-time ./run_sl.py -g alr -d cycloserine 2>&1 | tee log_alr_sl_noOR.txt
-
-########################################################################
-
-
-########################################################################
-########################################################################
-######################   COMPLETE DATA    ##############################
-########################################################################
-########################################################################
-
-
-########################################################################
-
-                                  #70/30
-
-########################################################################
-
-
-=-----------------------------------=
-# All features including AA index
-#                      [WITHOUT OR] 
+# COMPLETE data
 #------------------------------------=
-time ./run_cd_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_cd_7030_noOR.txt
-time ./run_cd_7030.py -g embB -d ethambutol 2>&1 | tee log_embb_cd_7030_noOR.txt
-time ./run_cd_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_cd_7030_noOR.txt
-time ./run_cd_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_cd_7030_noOR.txt
-time ./run_cd_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_cd_7030_noOR.txt
-time ./run_cd_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_cd_7030_noOR.txt
+
+time ./run_cd_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_cd_7030_.txt
+time ./run_cd_7030.py -g embB -d ethambutol 2>&1 | tee log_embb_cd_7030_.txt
+time ./run_cd_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_cd_7030_.txt
+time ./run_cd_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_cd_7030_.txt
+time ./run_cd_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_cd_7030_.txt
+time ./run_cd_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_cd_7030_.txt


 ########################################################################

-                                   # 80/20
+#                      80/20 [WITHOUT OR]

 ########################################################################
+=-----------------------------------=
+# actual data
+#------------------------------------=

-
+time ./run_8020.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_8020_.txt
+time ./run_8020.py -g embB -d ethambutol 2>&1 | tee log_embb_8020_.txt
+time ./run_8020.py -g katG -d isoniazid 2>&1 | tee log_katg_8020_.txt
+time ./run_8020.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_8020_.txt
+time ./run_8020.py -g gid -d streptomycin 2>&1 | tee log_gid_8020_.txt
+time ./run_8020.py -g alr -d cycloserine 2>&1 | tee log_alr_8020_.txt

 =-----------------------------------=
-# All features including AA index
-#                      [WITHOUT OR]
+# COMPLETE data
 #------------------------------------=
-time ./run_cd_8020.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_cd_8020_noOR.txt
-time ./run_cd_8020.py -g embB -d ethambutol 2>&1 | tee log_embb_cd_8020_noOR.txt
-time ./run_cd_8020.py -g katG -d isoniazid 2>&1 | tee log_katg_cd_8020_noOR.txt
-time ./run_cd_8020.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_cd_8020_noOR.txt
-time ./run_cd_8020.py -g gid -d streptomycin 2>&1 | tee log_gid_cd_8020_noOR.txt
-time ./run_cd_8020.py -g alr -d cycloserine 2>&1 | tee log_alr_cd_8020_noOR.txt

+time ./run_cd_8020.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_cd_8020_.txt
+time ./run_cd_8020.py -g embB -d ethambutol 2>&1 | tee log_embb_cd_8020_.txt
+time ./run_cd_8020.py -g katG -d isoniazid 2>&1 | tee log_katg_cd_8020_.txt
+time ./run_cd_8020.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_cd_8020_.txt
+time ./run_cd_8020.py -g gid -d streptomycin 2>&1 | tee log_gid_cd_8020_.txt
+time ./run_cd_8020.py -g alr -d cycloserine 2>&1 | tee log_alr_cd_8020_.txt
 ########################################################################

-                                   # SL
+#                          SL [WITHOUT OR]

 ########################################################################

-
-
 =-----------------------------------=
-# All features including AA index
-#                      [WITHOUT OR]
+# actual data
 #------------------------------------=
-time ./run_cd_sl.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_cd_sl_noOR.txt
-time ./run_cd_sl.py -g embB -d ethambutol 2>&1 | tee log_embb_cd_sl_noOR.txt
-time ./run_cd_sl.py -g katG -d isoniazid 2>&1 | tee log_katg_cd_sl_noOR.txt
-time ./run_cd_sl.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_cd_sl_noOR.txt
-time ./run_cd_sl.py -g gid -d streptomycin 2>&1 | tee log_gid_cd_sl_noOR.txt
-time ./run_cd_sl.py -g alr -d cycloserine 2>&1 | tee log_alr_cd_sl_noOR.txt
-
+time ./run_sl.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_sl_.txt
+time ./run_sl.py -g embB -d ethambutol 2>&1 | tee log_embb_sl_.txt
+time ./run_sl.py -g katG -d isoniazid 2>&1 | tee log_katg_sl_.txt
+time ./run_sl.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_sl_.txt
+time ./run_sl.py -g gid -d streptomycin 2>&1 | tee log_gid_sl_.txt
+time ./run_sl.py -g alr -d cycloserine 2>&1 | tee log_alr_sl_.txt

+=-----------------------------------=
+# COMPLETE data
+#------------------------------------=
+time ./run_cd_sl.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_cd_sl_.txt
+time ./run_cd_sl.py -g embB -d ethambutol 2>&1 | tee log_embb_cd_sl_.txt
+time ./run_cd_sl.py -g katG -d isoniazid 2>&1 | tee log_katg_cd_sl_.txt
+time ./run_cd_sl.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_cd_sl_.txt
+time ./run_cd_sl.py -g gid -d streptomycin 2>&1 | tee log_gid_cd_sl_.txt
+time ./run_cd_sl.py -g alr -d cycloserine 2>&1 | tee log_alr_cd_sl_.txt


 ########################################################################
@ -167,4 +92,4 @@ time ./run_cd_sl.py -g alr -d cycloserine 2>&1 | tee log_alr_cd_sl_noOR.txt
 time ./run_FS.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030.txt


-time ./run_FS_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030_noOR.txt
+time ./run_FS_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030_.txt
--- a/scripts/ml/scrMult_CALL.py
+++ b/scripts/ml/scrMult_CALL.py
@ -1,116 +0,0 @@
-fs_test = RFECV(DecisionTreeClassifier(**rs) 
-             , cv =  StratifiedKFold(n_splits = 10, shuffle = True,**rs)
-             , scoring = 'matthews_corrcoef')
-
-models = [('Logistic Regression'       , LogisticRegression(**rs) )]
-          #, ('Logistic RegressionCV'     , LogisticRegressionCV(**rs) )]
-
-
-for m in models:
-    print(m)
-print('\n================================================================\n')
-
-index = 1
-for model_name, model_fn in models:
-    print('\nRunning classifier:', index
-          , '\nModel_name:'               , model_name
-          , '\nModel func:'               , model_fn)
-          #, '\nList of models:', models)
-    index = index+1
-
-fs2 = RFECV(model_fn
-            , cv = skf_cv
-            , scoring = 'matthews_corrcoef')
-
-from sklearn.datasets import make_friedman1
-from sklearn.datasets import load_iris
-
-X_eg, y_eg = load_iris(return_X_y=True)
-#X_eg, y_eg = make_friedman1(n_samples=50, n_features=10, random_state=0)
-fs2.fit(X_eg,y_eg)
-fs2.support_
-fs2.ranking_
-###############################################################################
-# LR
-
-a_fs = fsgs(input_df = X
-     , target = y
-     #, param_gridLd = [{'fs__min_features_to_select' : []}]
-     , blind_test_df = X_bts
-     , blind_test_target = y_bts
-     #, estimator = RandomForestClassifier(**rs, **njobs, bootstrap = True, oob_score = True)
-     , estimator = LogisticRegression(**rs)
-     , use_fs = False # set True to use DT as a RFECV estimator
-     , var_type = 'mixed')
-
-a_fs.keys()
-a_fsDF  = pd.DataFrame(a_fs.items()) # LR
-a_fsDF2 = pd.DataFrame(a_fs2.items()) # use_FS= True
-a_fsDF3 = pd.DataFrame(a_fs3.items()) # RF
-
-# this one
-a_fs0 = fsgs(input_df = X
-         , target = y
-         , param_gridLd = [{'fs__min_features_to_select' : [1]}]
-         , blind_test_df = X_bts
-         , blind_test_target = y_bts
-         , estimator = LogisticRegression(**rs)
-         , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
-         , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv =  skf_cv, scoring = 'matthews_corrcoef')
-         , cv_method =  skf_cv
-         , var_type = 'mixed'
-         )
-###############################################
-##############################################################################
-# my function CALL
-#import fsgs from UQ_FS_fn
-
-# RFECV by default uses the estimator provided, custom option to provide fs model using use_fs and 
-a_fs = fsgs(input_df = X
-         , target = y
-         , param_gridLd = [{'fs__min_features_to_select' : [1]}]
-         , blind_test_df = X_bts
-         , blind_test_target = y_bts
-         , estimator = LogisticRegression(**rs)
-         #, use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
-         , use_fs = True, custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv =  skf_cv, scoring = 'matthews_corrcoef')
-         , cv_method =  skf_cv
-         , var_type = 'mixed'
-         )
-
-a_fs.keys()
-a_fs2.keys()
-a_fs3.keys()
-
-
-a_fsDF = pd.DataFrame(a_fs.items()) # LR
-a_fsDF.columns = ['parameter', 'param_value']
-
-a_fs2DF2 = pd.DataFrame(a_fs2.items()) # use_FS= True
-a_fs2DF2.columns = ['parameter', 'param_value']
-
-a_fsDF3 = pd.DataFrame(a_fs3.items()) # RF
-
-##############
-a_mask = a_fs['fs_res_array']
-a_fsDF.loc[a_fsDF['parameter'] == 'fs_res_array']
-
-mod_selF = a_fs2DF2.loc[a_fsDF['parameter'] == 'sel_features_names']; mod_selF
-mod_selFT = mod_selF.T
-
-# subset keys
-#keys_to_extract = ['model_name', 'fs_method', 'sel_features_names', 'all_feature_names', 'fs_res_array']
-keys_to_extract = ['fs_method', 'sel_features_names']
-a_subset = {key: a_fs2[key] for key in keys_to_extract}
-a_subsetDF =  pd.DataFrame(a_subset); a_subsetDF
-
-mod_fs_method = a_fs2['fs_method']
-fs_name = re.search('estimator=(\w+)',mod_fs_method)
-fs_namefN = fs_namef.group(1)
-print('\nFS method:', fs_namefN)
-
-fsDF = a_subsetDF[['sel_features_names']];fsDF
-fsDF.columns = [fs_namefN+'_FS']
-fsDF.columns; fsDF
-###############################
-
--- a/scripts/ml/test_MultClfs.py
+++ b/scripts/ml/test_MultClfs.py
@ -1,117 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Fri Jun 24 11:07:05 2022
-
-@author: tanu
-"""
-import re
-import argparse
-import os, sys
-###############################################################################
-# gene  = 'pncA'
-# drug  = 'pyrazinamide'
-#total_mtblineage_uc = 8
-
-# #%% command line args: case sensitive
-# arg_parser = argparse.ArgumentParser()
-# arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
-# arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
-# args = arg_parser.parse_args()
-
-# drug    = args.drug
-# gene    = args.gene
-
-###############################################################################
-homedir = os.path.expanduser("~")
-sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
-
-###############################################################################
-#==================
-# Import data
-#==================
-from ml_data_7030 import *
-setvars(gene,drug)
-from ml_data_7030 import *
-
-# from YC run_all_ML: run locally
-#from UQ_yc_RunAllClfs import run_all_ML
-
-#====================
-# Import ML functions 
-#====================
-from MultClfs import *
-
-#==================
-# other vars
-#==================
-tts_split_7030    = '70_30'
-OutFile_suffix  = '7030'
-#==================
-# Specify outdir 
-#==================
-outdir_ml = outdir + 'ml/tts_7030/'
-print('\nOutput directory:', outdir_ml)
-
-#outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
-#outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
-
-###############################################################################
-print('\n#####################################################################\n'
-      , '\nRunning ML analysis: Multiple models'
-      , '\nGene name:', gene
-      , '\nDrug name:', drug)
-
-###############################################################################
-#%% Test MultModelsCl_noBT WITHOUT returning formatted output
-#================
-# MultModelsCl_noBT: without formatted output
-#================
-mmD = MultModelsCl_noBT(input_df = X_smnc
-                    , target = y_smnc
-                    , var_type = 'mixed'
-                    , tts_split_type = tts_split_7030
-                    , resampling_type = 'smnc'
-                    , skf_cv = skf_cv
-                    , blind_test_df = X_bts
-                    , blind_test_target = y_bts
-                    , add_cm = True 
-                    , add_yn = True
-                    , run_blind_test = True
-                    , return_formatted_output = False)
-
-#================
-# MultModelsCl_noBT: WITH formatted output
-#================
-mmDF3 = MultModelsCl_noBT(input_df = X_smnc
-                    , target = y_smnc
-                    , var_type = 'mixed'
-                    , tts_split_type = tts_split_7030
-                    , resampling_type = 'smnc'
-                    , skf_cv = skf_cv
-                    , blind_test_df = X_bts
-                    , blind_test_target = y_bts
-                    , add_cm = True 
-                    , add_yn = True
-                    , run_blind_test = True
-                    , return_formatted_output= True )
-
-mmDF9= MultModelsCl_noBT(input_df = X
-                    , target = y
-                    , var_type = 'mixed'
-                    , tts_split_type = tts_split_7030
-                    , resampling_type = 'none'
-                    , skf_cv = None
-                    , blind_test_df = X_bts
-                    , blind_test_target = y_bts
-                    , add_cm = True 
-                    , add_yn = True
-                    , run_blind_test = True
-                    , return_formatted_output= True )
-#=================
-# test function
-#=================
-# output from function call 
-ProcessMultModelsCl(mmD)
-ProcessMultModelsCl(testD)
-