LSHTM_analysis/scripts/ml/run_8020.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 20 13:05:23 2022

@author: tanu
"""
#%%Imports ####################################################################
import re
import argparse
import os, sys

# gene  = 'pncA'
# drug  = 'pyrazinamide'
#total_mtblineage_uc = 8
###############################################################################
#%% command line args: case sensitive
# Build the command-line interface: two optional string options, both
# defaulting to the empty string when not supplied.
arg_parser = argparse.ArgumentParser()
for _flags, _help_text in (
        (('-d', '--drug'), 'drug name'),
        (('-g', '--gene'), 'gene name'),
):
    arg_parser.add_argument(*_flags, help=_help_text, default='')

args = arg_parser.parse_args()

# Expose the parsed values under the module-level names used further down.
drug, gene = args.drug, args.gene

###############################################################################
# Extend the import search path with the ML scripts directory located under
# the current user's home directory, so the project modules below resolve.
homedir = os.path.expanduser("~")
_ml_scripts_dir = homedir + '/git/LSHTM_analysis/scripts/ml'
sys.path.append(_ml_scripts_dir)

###############################################################################
#==================
# Import data
#==================
# The statement order below matters; do not merge or reorder the imports.
# 1) First star-import: brings setvars() (and anything else the module
#    currently exports) into this namespace.
from ml_data_8020 import *
# 2) Configure the data module for the gene/drug given on the command line.
#    NOTE(review): presumably setvars() creates/updates module-level data
#    objects inside ml_data_8020 (e.g. the X*/y*/outdir names used later in
#    this script) -- confirm against ml_data_8020.
setvars(gene,drug)
# 3) Second star-import: a star-import copies name bindings at the time it
#    runs, so it is repeated here to (re)bind whatever ml_data_8020 defines
#    after the setvars() call.
from ml_data_8020 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

#====================
# Import ML functions
#====================
from MultClfs import *

#==================
# other vars
#==================
# Label for the train/test split; passed to MultModelsCl as tts_split_type.
tts_split_8020    = '80_20'
# Suffix appended to the output file name.
OutFile_suffix  = '8020'

#==================
# Specify outdir
#==================
# `outdir` is not defined in this script; it is expected to be in scope from
# the star-imports above.
outdir_ml = outdir + 'ml/tts_8020/'
# Create the output directory up front: DataFrame.to_csv() does not create
# missing parent directories, so without this the script would only fail at
# the very end, after all models have been run.
os.makedirs(outdir_ml, exist_ok = True)
print('\nOutput directory:', outdir_ml)

#outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
# NOTE(review): there is no separator between 'noOR' and the suffix, so the
# file name ends in '_baselineC_noOR8020.csv' -- left unchanged in case
# downstream consumers rely on it; confirm this is intended.
outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'
#%% Running models ############################################################
# Announce the start of the run together with the gene/drug being processed.
print(
    '\n#####################################################################\n',
    '\nStarting--> Running ML analysis: Baseline modes (No FS)',
    '\nGene name:', gene,
    '\nDrug name:', drug,
    '\n#####################################################################\n',
)

# One row per resampling variant:
#   (paramD key, input data, target, resampling_type label)
# The data/target objects are not defined in this script; they are expected
# to be in scope from the star-imports above.
_sampling_table = (
    ('baseline_paramD', X,      y,      'none'),
    ('smnc_paramD',     X_smnc, y_smnc, 'smnc'),
    ('ros_paramD',      X_ros,  y_ros,  'ros'),
    ('rus_paramD',      X_rus,  y_rus,  'rus'),
    ('rouC_paramD',     X_rouC, y_rouC, 'rouC'),
)

# Keyword arguments for MultModelsCl, keyed by resampling variant.
# 'var_type' is 'mixed' for every variant.
paramD = {
    param_key: {
        'input_df': input_df,
        'target': target,
        'var_type': 'mixed',
        'resampling_type': resampling_type,
    }
    for param_key, input_df, target, resampling_type in _sampling_table
}

##==============================================================================
## Dict with no CV BT formatted df
## mmD = {}
## for k, v in paramD.items():
## #    print(mmD[k])
##     scores_8020D = MultModelsCl(**paramD[k]
##                         , tts_split_type = tts_split_8020
##                         , skf_cv = skf_cv
##                         , blind_test_df = X_bts
##                         , blind_test_target = y_bts
##                         , add_cm = True
##                         , add_yn = True
##                         , return_formatted_output = False)
##     mmD[k] = scores_8020D
##==============================================================================
## Run the classifiers once per resampling variant and collect the formatted
## output of each run, keyed by the paramD key.
## MultModelsCl, skf_cv, X_bts, y_bts and pd are not defined in this script;
## they are expected to be in scope from the star-imports above.
mmDD = {}
for sampling_key, sampling_params in paramD.items():
    mmDD[sampling_key] = MultModelsCl(**sampling_params
                        , tts_split_type = tts_split_8020
                        , skf_cv = skf_cv
                        , blind_test_df = X_bts
                        , blind_test_target = y_bts
                        , add_cm = True
                        , add_yn = True
                        , return_formatted_output = True)

# Stack the per-variant results into one frame. This is done exactly once:
# the previous code repeated the identical whole-dict concat inside a loop
# over mmDD, redoing the same work on every iteration.
# NOTE(review): assumes each MultModelsCl result is a DataFrame (required by
# pd.concat) -- confirm against MultClfs.
out_wf_8020 = pd.concat(mmDD, ignore_index = True)

# Order rows by resampling and source_data (ascending), then by MCC
# (descending) within each group; returns a new frame.
out_wf_8020f = out_wf_8020.sort_values(by = ['resampling', 'source_data', 'MCC']
                                       , ascending = [True, True, False])

# Summarise what was produced before writing it out.
print(
    '\n######################################################################',
    '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)',
    '\nGene:', gene.lower(),
    '\nDrug:', drug,
    '\noutput file:', outFile_wf,
    '\nDim of output:', out_wf_8020f.shape,
    '\n######################################################################',
)
###############################################################################
#====================
# Write output file
#====================
# Persist the sorted results as CSV, omitting the DataFrame index column.
out_wf_8020f.to_csv(outFile_wf, index=False)
print('\nFile successfully written:', outFile_wf)
###############################################################################