added all run scripts for different splits

2022-06-24 20:39:50 +01:00 · 2022-06-24 20:39:50 +01:00 · 5d38cde912
commit 5d38cde912
parent e2bc384155
6 changed files with 948 additions and 0 deletions
--- a/scripts/ml/run_8020.py
+++ b/scripts/ml/run_8020.py
@ -0,0 +1,141 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Mon Jun 20 13:05:23 2022
@author: tanu
 """
 #%%Imports ####################################################################
 import re
 import argparse
 import os, sys
 # gene  = 'pncA'
 # drug  = 'pyrazinamide'
 #total_mtblineage_uc = 8
 ###############################################################################
 #%% command line args: case sensitive
 arg_parser = argparse.ArgumentParser()
 arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
 arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
 args = arg_parser.parse_args()
 drug    = args.drug
 gene    = args.gene
 ###############################################################################
 homedir = os.path.expanduser("~")
 sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
 ###############################################################################
 #==================
 # Import data
 #==================
 from ml_data_8020 import *
 setvars(gene,drug)
 from ml_data_8020 import *
 # from YC run_all_ML: run locally
 #from UQ_yc_RunAllClfs import run_all_ML
 #====================
 # Import ML functions 
 #====================
 from MultClfs import *
 #==================
 # other vars
 #==================
 tts_split_8020    = '80_20'
 OutFile_suffix  = '8020'
 #==================
 # Specify outdir 
 #==================
 outdir_ml = outdir + 'ml/tts_8020/'
 print('\nOutput directory:', outdir_ml)
 #outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
 outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'
 #%% Running models ############################################################
 print('\n#####################################################################\n'
      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
      , '\nGene name:', gene
      , '\nDrug name:', drug
      , '\n#####################################################################\n')
 # Keyword-argument sets for MultModelsCl(), one per resampling strategy.
 # Each row below is (key prefix, resampling_type label, feature frame, target);
 # '_paramD' is appended to the key prefix to form the dictionary key.
 paramD = {
        prefix + '_paramD': { 'input_df'        : features
                            , 'target'          : labels
                            , 'var_type'        : 'mixed'
                            , 'resampling_type' : resamp}
        for prefix, resamp, features, labels in [
              ('baseline', 'none', X     , y     )
            , ('smnc'    , 'smnc', X_smnc, y_smnc)
            , ('ros'     , 'ros' , X_ros , y_ros )
            , ('rus'     , 'rus' , X_rus , y_rus )
            , ('rouC'    , 'rouC', X_rouC, y_rouC)
            ]
        }
 ##==============================================================================
 ## Dict with no CV BT formatted df
 ## mmD = {}
 ## for k, v in paramD.items():
 ## #    print(mmD[k])
 ##     scores_8020D = MultModelsCl(**paramD[k]
 ##                         , tts_split_type = tts_split_8020
 ##                         , skf_cv = skf_cv
 ##                         , blind_test_df = X_bts
 ##                         , blind_test_target = y_bts
 ##                         , add_cm = True 
 ##                         , add_yn = True
 ##                         , return_formatted_output = False)
 ##     mmD[k] = scores_8020D
 ##==============================================================================
 ## Run the classifier suite once per resampling strategy and keep each result
 ## under the key of the parameter set that produced it.
 ## NOTE(review): with return_formatted_output = True each call is expected to
 ## return one formatted DataFrame (CV, BT and metadata) - confirm in MultClfs.
 mmDD = {}
 for k, v in paramD.items():
    scores_8020D = MultModelsCl(**v
                        , tts_split_type = tts_split_8020
                        , skf_cv = skf_cv
                        , blind_test_df = X_bts
                        , blind_test_target = y_bts
                        , add_cm = True
                        , add_yn = True
                        , return_formatted_output = True)
    mmDD[k] = scores_8020D
 # Stack the per-strategy frames into a single frame. pd.concat() takes the
 # whole mapping at once, so one call is enough; calling it inside a loop over
 # the keys would only repeat the same full concatenation.
 out_wf_8020 = pd.concat(mmDD, ignore_index = True)
 # Group rows by resampling type and data source, best MCC first in each group.
 out_wf_8020f = out_wf_8020.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
 print('\n######################################################################'
      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
      , '\nGene:', gene.lower()
      , '\nDrug:', drug
      , '\noutput file:', outFile_wf
      , '\nDim of output:', out_wf_8020f.shape
      , '\n######################################################################')
 ###############################################################################
 #====================
 # Write output file
 #====================
 out_wf_8020f.to_csv(outFile_wf, index = False)
 print('\nFile successfully written:', outFile_wf)
 ###############################################################################
--- a/scripts/ml/run_FS_7030.py
+++ b/scripts/ml/run_FS_7030.py
@ -0,0 +1,242 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Tue May 24 08:11:05 2022
@author: tanu
 """
 #%%
 import os, sys
 import pandas as pd
 import numpy as np
 import pprint as pp
 from copy import deepcopy
 from sklearn import linear_model
 from sklearn import datasets
 from collections import Counter
 from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
 from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier
 from sklearn.naive_bayes import BernoulliNB
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.svm import SVC
 from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
 from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
 from sklearn.naive_bayes import GaussianNB
 from sklearn.gaussian_process import GaussianProcessClassifier, kernels
 from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
 from sklearn.neural_network import MLPClassifier
 from sklearn.svm import SVC
 from xgboost import XGBClassifier
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
 from sklearn.compose import ColumnTransformer
 from sklearn.compose import make_column_transformer
 from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
 from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
 # added
 from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
 from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
 from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
 from sklearn.pipeline import Pipeline, make_pipeline
 from sklearn.feature_selection import RFE, RFECV
 import itertools
 import seaborn as sns
 import matplotlib.pyplot as plt
 from statistics import mean, stdev, median, mode
 from imblearn.over_sampling import RandomOverSampler
 from imblearn.under_sampling import RandomUnderSampler
 from imblearn.over_sampling import SMOTE
 from sklearn.datasets import make_classification
 from imblearn.combine import SMOTEENN
 from imblearn.combine import SMOTETomek
 from imblearn.over_sampling import SMOTENC
 from imblearn.under_sampling import EditedNearestNeighbours
 from imblearn.under_sampling import RepeatedEditedNearestNeighbours
 from sklearn.model_selection import GridSearchCV
 from sklearn.base import BaseEstimator
 from sklearn.impute import KNNImputer as KNN
 import json
 import argparse
 import re
 ###############################################################################
 #gene  = 'pncA'
 #drug  = 'pyrazinamide'
 #total_mtblineage_uc = 8
 #%% command line args: case sensitive
 arg_parser = argparse.ArgumentParser()
 arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
 arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
 args = arg_parser.parse_args()
 drug    = args.drug
 gene    = args.gene
 ###############################################################################
 #==================
 # other vars
 #==================
 tts_split    = '70_30'
 OutFile_suffix  = '7030_FS'
 ###############################################################################
 homedir = os.path.expanduser("~")
 sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
 ###############################################################################
 #==================
 # Import data
 #==================
 from ml_data_7030 import *
 setvars(gene,drug)
 from ml_data_7030 import *
 # from YC run_all_ML: run locally
 #from UQ_yc_RunAllClfs import run_all_ML
 #==========================================
 # Import ML functions:
 # fsgs_rfecv(): RFECV for Feature selection
 #==========================================
 from MultClfs import *
 #==================
 # Specify outdir 
 #==================
 outdir_ml = outdir + 'ml/tts_7030/fs/'
 print('\nOutput directory:', outdir_ml)
 #OutFileFS = outdir_ml + gene.lower() + '_FS' + OutFile_suffix + '.json'
 OutFileFS = outdir_ml + gene.lower() + '_FS_noOR' + OutFile_suffix + '.json'
 ############################################################################
 ###############################################################################
 #====================
 # single model CALL
 #====================
 # aFS = fsgs(input_df = X
 #          , target = y
 #          , param_gridLd = [{'fs__min_features_to_select': [1]}]
 #          , blind_test_df = X_bts
 #          , blind_test_target = y_bts
 #          , estimator = LogisticRegression(**rs)
 #          , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
 #          , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv =  skf_cv, scoring = 'matthews_corrcoef')
 #          , cv_method =  skf_cv
 #          , var_type = 'mixed'
 #          )
 #############
 # Loop
 ############
 #models_fs = [('Decision Tree'             , DecisionTreeClassifier(**rs)) ]
 # (display name, unfitted estimator) pairs to run feature selection with.
 # The display name becomes the key of that model's entry in the results dict.
 # NOTE(review): `rs` and `njobs` come from the star-imports above; presumably
 # keyword dicts for random_state and n_jobs - confirm in ml_data_7030/MultClfs.
 models_fs = [('AdaBoost Classifier'   , AdaBoostClassifier(**rs) )
          , ('Decision Tree'             , DecisionTreeClassifier(**rs) )
          , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
          , ('Extra Trees'               , ExtraTreesClassifier(**rs) )
          , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
          , ('LDA'                       , LinearDiscriminantAnalysis() )
          , ('Logistic Regression'       , LogisticRegression(**rs) )
          , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
          , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
          , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000 ) )
          # 'sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated
          # in scikit-learn 1.1 and is rejected by later releases.
          , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
                                                                 , n_estimators = 1000
                                                                 , bootstrap    = True
                                                                 , oob_score    = True
                                                                 , **njobs
                                                                 , **rs
                                                                 , max_features = 'sqrt') )
          , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
          , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )
          , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
          ## , ('XGBoost'                   , XGBClassifier(**rs, **njobs, verbosity = 3 , use_label_encoder = False) )
          ]
 print('\n#####################################################################'
      , '\nRunning Feature Selection using classfication models_fs (n):', len(models_fs)
      , '\nGene:'  , gene.lower()
      , '\nDrug:'  , drug
      , '\nSplit:' , tts_split
      ,'\n####################################################################')
 for m in models_fs:
    print(m)
 print('\n====================================================================\n')
 # Run fsgs_rfecv() once per classifier and store each result under the
 # classifier's display name. `index` is a 1-based progress counter used only
 # in the log message.
 out_fsD = {}
 for index, (model_name, model_fn) in enumerate(models_fs, start = 1):
    print('\nRunning classifier with FS:', index
          , '\nModel_name:'               , model_name
          , '\nModel func:'               , model_fn)
          #, '\nList of models_fs:', models_fs)
    out_fsD[model_name] = fsgs_rfecv(input_df = X
              , target = y
              , param_gridLd = [{'fs__min_features_to_select': [1]}]
              , blind_test_df = X_bts
              , blind_test_target = y_bts
              , estimator = model_fn
              , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
              , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv =  skf_cv, scoring = 'matthews_corrcoef')
              , cv_method =  skf_cv
              , var_type = 'mixed'
              )
 #%% Checking results dict
 # Sanity check: every classifier's result dict should hold the same number of
 # entries. The counts are computed outside any loop so they are defined even
 # when out_fsD is empty.
 checkL = [len(v) for v in out_fsD.values()] # entries per subDict
 tot_Ditems = sum(checkL)
 n_sD = len(checkL) # no. of subDicts
 l_sD = list(set(checkL)) # distinct subDict lengths (one value when all agree)
 print('\nTotal no.of subdicts:', n_sD)
 if len(l_sD) == 1 and tot_Ditems == n_sD*l_sD[0]:
    print('\nPASS: successful run for all Classifiers'
          , '\nLength of each subdict:', l_sD)
 else:
    # Report the mismatch instead of staying silent about it.
    print('\nFAIL: classifier results differ in length, or no results were produced'
          , '\nLengths found:', l_sD)
 print('\nSuccessfully ran Feature selection on', len(models_fs), 'classifiers'
      , '\nGene:', gene.lower()
      , '\nDrug:', drug
      , '\nSplit type:', tts_split
      , '\nTotal fs models results:', len(out_fsD)
      , '\nTotal items in output:', tot_Ditems )
 ##############################################################################
 #%% json output
 #========================================
 # Write final output file
 # https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
 #========================================
 # Serialise the results dict to JSON and write it to OutFileFS.
 print('\nWriting Final output file (json):', OutFileFS)
 with open(OutFileFS, 'w') as out_handle:
    # A custom encoder could be supplied here if needed: cls = NpEncoder
    json_text = json.dumps(out_fsD)
    out_handle.write(json_text)
 # Read the file back in to confirm it parses as JSON
 with open(OutFileFS, 'r') as in_handle:
    data = json.load(in_handle)
 #############################################################################
--- a/scripts/ml/run_cd_7030.py
+++ b/scripts/ml/run_cd_7030.py
@ -0,0 +1,142 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Mon Jun 20 13:05:23 2022
@author: tanu
 """
 #%%Imports ####################################################################
 import re
 import argparse
 import os, sys
 # gene  = 'pncA'
 # drug  = 'pyrazinamide'
 #total_mtblineage_uc = 8
 ###############################################################################
 #%% command line args: case sensitive
 arg_parser = argparse.ArgumentParser()
 arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
 arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
 args = arg_parser.parse_args()
 drug    = args.drug
 gene    = args.gene
 ###############################################################################
 homedir = os.path.expanduser("~")
 sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
 ###############################################################################
 #==================
 # Import data
 #==================
 from ml_data_cd_7030 import *
 setvars(gene,drug)
 from ml_data_cd_7030 import *
 # from YC run_all_ML: run locally
 #from UQ_yc_RunAllClfs import run_all_ML
 #====================
 # Import ML functions 
 #====================
 from MultClfs import *
 #==================
 # other vars
 #==================
 tts_split_cd_7030    = 'cd_7030'
 OutFile_suffix  = '_cd_7030'
 #==================
 # Specify outdir 
 #==================
 outdir_ml = outdir + 'ml/tts_cd_7030/'
 print('\nOutput directory:', outdir_ml)
 #outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
 outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'
 #%% Running models ############################################################
 print('\n#####################################################################\n'
      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
      , '\nGene name:', gene
      , '\nDrug name:', drug
      , '\n#####################################################################\n')
 # Keyword-argument sets for MultModelsCl(), one per resampling strategy.
 # Each row below is (key prefix, resampling_type label, feature frame, target);
 # '_paramD' is appended to the key prefix to form the dictionary key.
 paramD = {
        prefix + '_paramD': { 'input_df'        : features
                            , 'target'          : labels
                            , 'var_type'        : 'mixed'
                            , 'resampling_type' : resamp}
        for prefix, resamp, features, labels in [
              ('baseline', 'none', X     , y     )
            , ('smnc'    , 'smnc', X_smnc, y_smnc)
            , ('ros'     , 'ros' , X_ros , y_ros )
            , ('rus'     , 'rus' , X_rus , y_rus )
            , ('rouC'    , 'rouC', X_rouC, y_rouC)
            ]
        }
 ##==============================================================================
 ## Dict with no CV BT formatted df
 ## mmD = {}
 ## for k, v in paramD.items():
 ## #    print(mmD[k])
 ##     scores_cd_7030D = MultModelsCl(**paramD[k]
 ##                         , tts_split_type = tts_split_cd_7030
 ##                         , skf_cv = skf_cv
 ##                         , blind_test_df = X_bts
 ##                         , blind_test_target = y_bts
 ##                         , add_cm = True 
 ##                         , add_yn = True
 ##                         , return_formatted_output = False)
 ##     mmD[k] = scores_cd_7030D
 ##==============================================================================
 ## Run the classifier suite once per resampling strategy and keep each result
 ## under the key of the parameter set that produced it.
 ## NOTE(review): with return_formatted_output = True each call is expected to
 ## return one formatted DataFrame (CV, BT and metadata) - confirm in MultClfs.
 mmDD = {}
 for k, v in paramD.items():
    scores_cd_7030D = MultModelsCl(**v
                        , tts_split_type = tts_split_cd_7030
                        , skf_cv = skf_cv
                        , blind_test_df = X_bts
                        , blind_test_target = y_bts
                        , add_cm = True
                        , add_yn = True
                        , return_formatted_output = True)
    mmDD[k] = scores_cd_7030D
 # Stack the per-strategy frames into a single frame. pd.concat() takes the
 # whole mapping at once, so one call is enough; calling it inside a loop over
 # the keys would only repeat the same full concatenation.
 out_wf_cd_7030 = pd.concat(mmDD, ignore_index = True)
 # Group rows by resampling type and data source, best MCC first in each group.
 out_wf_cd_7030f = out_wf_cd_7030.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
 print('\n######################################################################'
      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
      , '\nGene:', gene.lower()
      , '\nDrug:', drug
      , '\noutput file:', outFile_wf
      , '\nDim of output:', out_wf_cd_7030f.shape
      , '\n######################################################################')
 ###############################################################################
 #====================
 # Write output file
 #====================
 out_wf_cd_7030f.to_csv(outFile_wf, index = False)
 print('\nFile successfully written:', outFile_wf)
 ###############################################################################
--- a/scripts/ml/run_cd_8020.py
+++ b/scripts/ml/run_cd_8020.py
@ -0,0 +1,141 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Mon Jun 20 13:05:23 2022
@author: tanu
 """
 #%%Imports ####################################################################
 import re
 import argparse
 import os, sys
 # gene  = 'pncA'
 # drug  = 'pyrazinamide'
 #total_mtblineage_uc = 8
 ###############################################################################
 #%% command line args: case sensitive
 arg_parser = argparse.ArgumentParser()
 arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
 arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
 args = arg_parser.parse_args()
 drug    = args.drug
 gene    = args.gene
 ###############################################################################
 homedir = os.path.expanduser("~")
 sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
 ###############################################################################
 #==================
 # Import data
 #==================
 from ml_data_cd_8020 import *
 setvars(gene,drug)
 from ml_data_cd_8020 import *
 # from YC run_all_ML: run locally
 #from UQ_yc_RunAllClfs import run_all_ML
 #====================
 # Import ML functions 
 #====================
 from MultClfs import *
 #==================
 # other vars
 #==================
 tts_split_cd_8020    = 'cd_80_20'
 OutFile_suffix  = '_cd_8020'
 #==================
 # Specify outdir 
 #==================
 outdir_ml = outdir + 'ml/tts_cd_8020/'
 print('\nOutput directory:', outdir_ml)
 #outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
 outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'
 #%% Running models ############################################################
 print('\n#####################################################################\n'
      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
      , '\nGene name:', gene
      , '\nDrug name:', drug
      , '\n#####################################################################\n')
 # Keyword-argument sets for MultModelsCl(), one per resampling strategy.
 # Each row below is (key prefix, resampling_type label, feature frame, target);
 # '_paramD' is appended to the key prefix to form the dictionary key.
 paramD = {
        prefix + '_paramD': { 'input_df'        : features
                            , 'target'          : labels
                            , 'var_type'        : 'mixed'
                            , 'resampling_type' : resamp}
        for prefix, resamp, features, labels in [
              ('baseline', 'none', X     , y     )
            , ('smnc'    , 'smnc', X_smnc, y_smnc)
            , ('ros'     , 'ros' , X_ros , y_ros )
            , ('rus'     , 'rus' , X_rus , y_rus )
            , ('rouC'    , 'rouC', X_rouC, y_rouC)
            ]
        }
 ##==============================================================================
 ## Dict with no CV BT formatted df
 ## mmD = {}
 ## for k, v in paramD.items():
 ## #    print(mmD[k])
 ##     scores_cd_8020D = MultModelsCl(**paramD[k]
 ##                         , tts_split_type = tts_split_cd_8020
 ##                         , skf_cv = skf_cv
 ##                         , blind_test_df = X_bts
 ##                         , blind_test_target = y_bts
 ##                         , add_cm = True 
 ##                         , add_yn = True
 ##                         , return_formatted_output = False)
 ##     mmD[k] = scores_cd_8020D
 ##==============================================================================
 ## Run the classifier suite once per resampling strategy and keep each result
 ## under the key of the parameter set that produced it.
 ## NOTE(review): with return_formatted_output = True each call is expected to
 ## return one formatted DataFrame (CV, BT and metadata) - confirm in MultClfs.
 mmDD = {}
 for k, v in paramD.items():
    scores_cd_8020D = MultModelsCl(**v
                        , tts_split_type = tts_split_cd_8020
                        , skf_cv = skf_cv
                        , blind_test_df = X_bts
                        , blind_test_target = y_bts
                        , add_cm = True
                        , add_yn = True
                        , return_formatted_output = True)
    mmDD[k] = scores_cd_8020D
 # Stack the per-strategy frames into a single frame. pd.concat() takes the
 # whole mapping at once, so one call is enough; calling it inside a loop over
 # the keys would only repeat the same full concatenation.
 out_wf_cd_8020 = pd.concat(mmDD, ignore_index = True)
 # Group rows by resampling type and data source, best MCC first in each group.
 out_wf_cd_8020f = out_wf_cd_8020.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
 print('\n######################################################################'
      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
      , '\nGene:', gene.lower()
      , '\nDrug:', drug
      , '\noutput file:', outFile_wf
      , '\nDim of output:', out_wf_cd_8020f.shape
      , '\n######################################################################')
 ###############################################################################
 #====================
 # Write output file
 #====================
 out_wf_cd_8020f.to_csv(outFile_wf, index = False)
 print('\nFile successfully written:', outFile_wf)
 ###############################################################################
--- a/scripts/ml/run_cd_sl.py
+++ b/scripts/ml/run_cd_sl.py
@ -0,0 +1,141 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Mon Jun 20 13:05:23 2022
@author: tanu
 """
 #%%Imports ####################################################################
 import re
 import argparse
 import os, sys
 # gene  = 'pncA'
 # drug  = 'pyrazinamide'
 #total_mtblineage_uc = 8
 ###############################################################################
 #%% command line args: case sensitive
 arg_parser = argparse.ArgumentParser()
 arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
 arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
 args = arg_parser.parse_args()
 drug    = args.drug
 gene    = args.gene
 ###############################################################################
 homedir = os.path.expanduser("~")
 sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
 ###############################################################################
 #==================
 # Import data
 #==================
 from ml_data_cd_sl import *
 setvars(gene,drug)
 from ml_data_cd_sl import *
 # from YC run_all_ML: run locally
 #from UQ_yc_RunAllClfs import run_all_ML
 #====================
 # Import ML functions 
 #====================
 from MultClfs import *
 #==================
 # other vars
 #==================
 tts_split_cd_sl    = 'cd_sl'
 OutFile_suffix  = '_cd_sl'
 #==================
 # Specify outdir 
 #==================
 outdir_ml = outdir + 'ml/tts_cd_sl/'
 print('\nOutput directory:', outdir_ml)
 #outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
 outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'
 #%% Running models ############################################################
 print('\n#####################################################################\n'
      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
      , '\nGene name:', gene
      , '\nDrug name:', drug
      , '\n#####################################################################\n')
 # Keyword-argument sets for MultModelsCl(), one per resampling strategy.
 # Each row below is (key prefix, resampling_type label, feature frame, target);
 # '_paramD' is appended to the key prefix to form the dictionary key.
 paramD = {
        prefix + '_paramD': { 'input_df'        : features
                            , 'target'          : labels
                            , 'var_type'        : 'mixed'
                            , 'resampling_type' : resamp}
        for prefix, resamp, features, labels in [
              ('baseline', 'none', X     , y     )
            , ('smnc'    , 'smnc', X_smnc, y_smnc)
            , ('ros'     , 'ros' , X_ros , y_ros )
            , ('rus'     , 'rus' , X_rus , y_rus )
            , ('rouC'    , 'rouC', X_rouC, y_rouC)
            ]
        }
 ##==============================================================================
 ## Dict with no CV BT formatted df
 ## mmD = {}
 ## for k, v in paramD.items():
 ## #    print(mmD[k])
 ##     scores_cd_slD = MultModelsCl(**paramD[k]
 ##                         , tts_split_type = tts_split_cd_sl
 ##                         , skf_cv = skf_cv
 ##                         , blind_test_df = X_bts
 ##                         , blind_test_target = y_bts
 ##                         , add_cm = True 
 ##                         , add_yn = True
 ##                         , return_formatted_output = False)
 ##     mmD[k] = scores_cd_slD
 ##==============================================================================
 ## Run the classifier suite once per resampling strategy and keep each result
 ## under the key of the parameter set that produced it.
 ## NOTE(review): with return_formatted_output = True each call is expected to
 ## return one formatted DataFrame (CV, BT and metadata) - confirm in MultClfs.
 mmDD = {}
 for k, v in paramD.items():
    scores_cd_slD = MultModelsCl(**v
                        , tts_split_type = tts_split_cd_sl
                        , skf_cv = skf_cv
                        , blind_test_df = X_bts
                        , blind_test_target = y_bts
                        , add_cm = True
                        , add_yn = True
                        , return_formatted_output = True)
    mmDD[k] = scores_cd_slD
 # Stack the per-strategy frames into a single frame. pd.concat() takes the
 # whole mapping at once, so one call is enough; calling it inside a loop over
 # the keys would only repeat the same full concatenation.
 out_wf_cd_sl = pd.concat(mmDD, ignore_index = True)
 # Group rows by resampling type and data source, best MCC first in each group.
 out_wf_cd_slf = out_wf_cd_sl.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
 print('\n######################################################################'
      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
      , '\nGene:', gene.lower()
      , '\nDrug:', drug
      , '\noutput file:', outFile_wf
      , '\nDim of output:', out_wf_cd_slf.shape
      , '\n######################################################################')
 ###############################################################################
 #====================
 # Write output file
 #====================
 out_wf_cd_slf.to_csv(outFile_wf, index = False)
 print('\nFile successfully written:', outFile_wf)
 ###############################################################################
--- a/scripts/ml/run_sl.py
+++ b/scripts/ml/run_sl.py
@ -0,0 +1,141 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Mon Jun 20 13:05:23 2022
@author: tanu
 """
 #%%Imports ####################################################################
 import re
 import argparse
 import os, sys
 # gene  = 'pncA'
 # drug  = 'pyrazinamide'
 #total_mtblineage_uc = 8
 ###############################################################################
 #%% Command line interface (values are used as given: case sensitive)
 #   -d / --drug : drug name, empty string when omitted
 #   -g / --gene : gene name, empty string when omitted
 arg_parser = argparse.ArgumentParser()
 arg_parser.add_argument(
     '-d', '--drug',
     help = 'drug name',
     default = '',
 )
 arg_parser.add_argument(
     '-g', '--gene',
     help = 'gene name',
     default = '',
 )
 args = arg_parser.parse_args()
 # Expose the two options as plain module-level names for the rest of the script
 drug, gene = args.drug, args.gene
 ###############################################################################
 # Resolve the user's home directory and make the ML helper scripts in the
 # LSHTM_analysis checkout below it importable.
 homedir = os.path.expanduser("~")
 _ml_scripts_dir = homedir + '/git/LSHTM_analysis/scripts/ml'
 sys.path.append(_ml_scripts_dir)
 ###############################################################################
 #==================
 # Import data
 #==================
 # The order of the next three statements matters.
 # 1) setvars is not defined in this script, so it is presumably supplied by
 #    this first star-import.
 from ml_data_sl import *
 # 2) Configure the data module for the gene/drug given on the command line.
 #    NOTE(review): setvars' body is not visible here; presumably it builds the
 #    module-level data used below (X, y, X_bts, y_bts, skf_cv, outdir, ...)
 #    -- confirm against ml_data_sl.
 setvars(gene,drug)
 # 3) Star-import again so that names in ml_data_sl that were created or
 #    re-bound by setvars() are copied into this script's namespace; the first
 #    import only captured the module's state before setvars() ran.
 from ml_data_sl import *
 # from YC run_all_ML: run locally
 #from UQ_yc_RunAllClfs import run_all_ML
 #====================
 # Import ML functions 
 #====================
 # MultModelsCl, called further down, is not defined in this script;
 # presumably it comes from this import -- confirm.
 from MultClfs import *
 #==================
 # other vars
 #==================
 # Label for this train/test split; passed to MultModelsCl as tts_split_type
 tts_split_sl    = 'sl'
 # Suffix used when building the output file name
 OutFile_suffix  = 'sl'
 #==================
 # Specify outdir 
 #==================
 # `outdir` is not defined in this script; it is expected to come from the
 # star-imports earlier in the file.
 outdir_ml = outdir + 'ml/tts_sl/'
 # Create the output directory up front: the CSV is only written at the very
 # end of the run, and DataFrame.to_csv fails if the directory is missing,
 # which would discard the results of the whole model run.
 os.makedirs(outdir_ml, exist_ok = True)
 print('\nOutput directory:', outdir_ml)
 #outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
 # NOTE(review): there is no separator between 'noOR' and the suffix, so the
 # file name ends in '_baselineC_noORsl.csv'. Left unchanged because other
 # scripts may rely on this exact name -- confirm before renaming.
 outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'
 #%% Running models ############################################################
 # Announce the start of the run together with the gene and drug being analysed
 _start_banner = ('\n#####################################################################\n'
                  , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
                  , '\nGene name:', gene
                  , '\nDrug name:', drug
                  , '\n#####################################################################\n')
 print(*_start_banner)
 # One set of MultModelsCl keywords per resampling strategy.
 # Each row below is (paramD key, input data, target, resampling label);
 # every configuration uses var_type 'mixed'. Row order is the dict order.
 paramD = {
     label: {'input_df'          : features
             , 'target'          : labels
             , 'var_type'        : 'mixed'
             , 'resampling_type' : tag}
     for label, features, labels, tag in (
         ('baseline_paramD', X     , y     , 'none')
         , ('smnc_paramD'  , X_smnc, y_smnc, 'smnc')
         , ('ros_paramD'   , X_ros , y_ros , 'ros')
         , ('rus_paramD'   , X_rus , y_rus , 'rus')
         , ('rouC_paramD'  , X_rouC, y_rouC, 'rouC')
     )
 }
 ##==============================================================================
 ## Dict with no CV BT formatted df
 ## mmD = {}
 ## for k, v in paramD.items():
 ## #    print(mmD[k])
 ##     scores_slD = MultModelsCl(**paramD[k]
 ##                         , tts_split_type = tts_split_sl
 ##                         , skf_cv = skf_cv
 ##                         , blind_test_df = X_bts
 ##                         , blind_test_target = y_bts
 ##                         , add_cm = True 
 ##                         , add_yn = True
 ##                         , return_formatted_output = False)
 ##     mmD[k] = scores_slD
 ##==============================================================================
 ## Run MultModelsCl once for every resampling configuration in paramD and keep
 ## each formatted result (CV, BT and metadata, per the original note) under the
 ## same key in mmDD.
 # Keyword arguments that are the same for every configuration
 _common_kwargs = dict(tts_split_type = tts_split_sl
                       , skf_cv = skf_cv
                       , blind_test_df = X_bts
                       , blind_test_target = y_bts
                       , add_cm = True
                       , add_yn = True
                       , return_formatted_output = True)
 mmDD = {}
 for k, v in paramD.items():
     # v holds the per-configuration keywords (input_df, target, ...)
     scores_slD = MultModelsCl(**v, **_common_kwargs)
     mmDD[k] = scores_slD
 # Combine the per-resampling results held in mmDD into one DataFrame.
 # pd.concat takes the whole mapping in a single call, so it is called once
 # here; the previous per-key loop recomputed the identical concatenation on
 # every iteration. ignore_index = True gives the result a fresh 0..n-1 index.
 out_wf_sl = pd.concat(mmDD, ignore_index = True)
 # Order rows by resampling, then source_data (both ascending), with the
 # highest MCC first within each group; returns a new sorted DataFrame.
 out_wf_slf = out_wf_sl.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
 # Report what was produced (gene, drug, target file, output shape)
 _end_banner = ('\n######################################################################'
                , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
                , '\nGene:', gene.lower()
                , '\nDrug:', drug
                , '\noutput file:', outFile_wf
                , '\nDim of output:', out_wf_slf.shape
                , '\n######################################################################')
 print(*_end_banner)
 ###############################################################################
 #====================
 # Write output file
 #====================
 # index = False: the DataFrame's row index is not written to the CSV
 out_wf_slf.to_csv(outFile_wf, index = False)
 print('\nFile successfully written:', outFile_wf)
 ###############################################################################