# LSHTM_analysis/scripts/ml/run_7030_LOOP.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 20 13:05:23 2022

@author: tanu
"""
import re
import argparse
###############################################################################
# gene  = 'pncA'
# drug  = 'pyrazinamide'
#total_mtblineage_uc = 8

#%% command line args: case sensitive
# Both options are optional on the command line and fall back to an empty
# string; the values are passed through exactly as typed (case sensitive).
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name', default='')
arg_parser.add_argument('-g', '--gene', help='gene name', default='')
args = arg_parser.parse_args()

drug, gene = args.drug, args.gene

###############################################################################

###############################################################################
#==================
# Import data
#==================
# First star-import: brings setvars() into scope, along with every other
# public name the module exposes at this point.
from ml_data_7030 import *
# NOTE(review): presumably setvars() builds the gene/drug-specific module
# globals that this script uses but never defines itself (X, y, X_bts, y_bts,
# X_smnc, skf_cv, outdir, ...) — confirm against ml_data_7030.
setvars(gene,drug)
# Second star-import: a star-import binds names once, at import time, so it is
# repeated here to pick up whatever ml_data_7030 exposes after setvars() ran.
from ml_data_7030 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

#====================
# Import ML function
#====================
# TT run all ML clfs: baseline model
from MultModelsCl import MultModelsCl

#==================
# other vars
#==================
# Label of the train/test split passed to MultModelsCl, and the suffix that
# tags every output file name produced by this script.
tts_split_7030 = '70_30'
OutFile_suffix = '7030'

#==================
# Specify outdir
#==================
# outdir is not defined in this script; it is expected to already be in scope.
outdir_ml = f'{outdir}ml/tts_7030/'
print('\nOutput directory:', outdir_ml)

# Output csv paths: '<outdir_ml><gene, lower-cased>_baselineC[_ext]_<suffix>.csv'
outFile_wf = f'{outdir_ml}{gene.lower()}_baselineC_{OutFile_suffix}.csv'
outFile_lf = f'{outdir_ml}{gene.lower()}_baselineC_ext_{OutFile_suffix}.csv'

###############################################################################
# Score type -> 1-based position, in the order listed here.
# NOTE(review): not referenced anywhere else in this script; presumably used
# to order score rows when the results are reshaped/plotted — confirm.
score_type_ordermapD = {
    score_name: position
    for position, score_name in enumerate(
        [
            'mcc',
            'fscore',
            'jcc',
            'precision',
            'recall',
            'accuracy',
            'roc_auc',
            'TN',
            'FP',
            'FN',
            'TP',
            'trainingY_neg',
            'trainingY_pos',
            'blindY_neg',
            'blindY_pos',
            'fit_time',
            'score_time',
        ],
        start=1,
    )
}

# 'test_*' score names -> display labels. The labels are deliberately the same
# as in scoreBT_mapD so both sets of columns end up with identical names.
scoreCV_mapD = {
    'test_mcc': 'MCC',
    'test_fscore': 'F1',
    'test_precision': 'Precision',
    'test_recall': 'Recall',
    'test_accuracy': 'Accuracy',
    'test_roc_auc': 'ROC_AUC',
    'test_jcc': 'JCC',
}

# 'bts_*' score names -> the same display labels as scoreCV_mapD.
scoreBT_mapD = {
    'bts_mcc': 'MCC',
    'bts_fscore': 'F1',
    'bts_precision': 'Precision',
    'bts_recall': 'Recall',
    'bts_accuracy': 'Accuracy',
    'bts_roc_auc': 'ROC_AUC',
    'bts_jcc': 'JCC',
}

# # data dependent variables but NOT dependent on resampling
# bts_size  = len(X_bts)
# yc2       = Counter(y_bts)
# yc2_ratio = yc2[0]/yc2[1]

###############################################################################
# Banner announcing which gene/drug combination this run is for.
print(
    '\n#####################################################################\n',
    '\nRunning ML analysis: feature groups ',
    '\nGene name:', gene,
    '\nDrug name:', drug,
)


# Per-scenario keyword arguments for MultModelsCl: one entry for the training
# data as-is ('none') and one for the SMOTE-NC resampled data ('smnc').
fooD = {
    'baseline_paramD': {
        'input_df': X,
        'target': y,
        'var_type': 'mixed',
        'resampling_type': 'none',
    },
    'smnc_paramD': {
        'input_df': X_smnc,
        'target': y_smnc,
        'var_type': 'mixed',
        'resampling_type': 'smnc',
    },
}

# Run every scenario against the same 70/30 split, CV splitter and blind test
# set, and collect the returned scores under the scenario's key.
barD = {}
for k, v in fooD.items():
    #print(k)
    print(v)
    scores_7030D = MultModelsCl(
        **v,
        tts_split_type=tts_split_7030,
        skf_cv=skf_cv,
        blind_test_df=X_bts,
        blind_test_target=y_bts,
        add_cm=True,
        add_yn=True,
    )
    barD[k] = scores_7030D


# Keyword arguments for the remaining resampling scenarios, in the same shape
# as the entries of fooD so each can be splatted into MultModelsCl(**paramD).
# These were written as `{name = value}`, which is not a valid dict literal
# and prevented the whole file from parsing; keys are now strings.

# NOTE(review): resampling_type was 'smnc' here, which looks like a copy-paste
# slip from the SMNC entry (every other scenario is labelled with its own
# name) — confirm 'ros' is a label MultModelsCl accepts.
ros_paramD = {
    'input_df': X_ros,
    'target': y_ros,
    'var_type': 'mixed',
    'resampling_type': 'ros',
}


rus_paramD = {
    'input_df': X_rus,
    'target': y_rus,
    'var_type': 'mixed',
    'resampling_type': 'rus',
}


rouC_paramD = {
    'input_df': X_rouC,
    'target': y_rouC,
    'var_type': 'mixed',
    'resampling_type': 'rouC',
}


#====
# Run the classifiers for the rouC scenario with the same split, CV splitter
# and blind test set as the other runs. This rebinds scores_7030D, so the
# value left over from the scenario loop is replaced.
scores_7030D = MultModelsCl(
    **rouC_paramD,
    tts_split_type=tts_split_7030,
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
    add_cm=True,
    add_yn=True,
)

###############################################################################
#%% SMOTE NC: Smote Oversampling [Numerical + categorical]
#================
# Baseline
# SMOTE NC: SMNC
#================
# Resampling label stamped onto the SMNC result frames.
rs_smnc = 'smnc'

# Run the classifiers on the SMOTE-NC resampled training data, scored by
# cross-validation (skf_cv) and against the blind test set.
smnc_scores_mmD = MultModelsCl(
    input_df=X_smnc,
    target=y_smnc,
    var_type='mixed',
    tts_split_type=tts_split_7030,
    resampling_type='smnc',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
    add_cm=True,
    add_yn=True,
)

# Tabular form of the returned scores.
smnc_all_scores = pd.DataFrame(smnc_scores_mmD)
#------------------------
#  WF: only CV and BTS
#-----------------------
# Transposed so that the score names are column labels; the column-wise
# filters and renames below rely on that orientation.
smnc_allT = smnc_all_scores.T

# Cross-validation scores: columns whose name contains 'test_'.
# .copy() detaches the subset from smnc_allT so the assignments below modify
# an independent frame instead of raising pandas' SettingWithCopyWarning.
smnc_CV = smnc_allT.filter(regex='test_', axis=1).copy()
# map colnames for consistency to allow concatenating
# (a column name missing from scoreCV_mapD maps to NaN)
smnc_CV.columns = smnc_CV.columns.map(scoreCV_mapD)
smnc_CV['Data_source'] = 'CV'
smnc_CV['Resampling'] = rs_smnc

# Blind-test scores: columns whose name contains 'bts_'; same treatment.
smnc_BT = smnc_allT.filter(regex='bts_', axis=1).copy()
# map colnames for consistency to allow concatenating
# (a column name missing from scoreBT_mapD maps to NaN)
smnc_BT.columns = smnc_BT.columns.map(scoreBT_mapD)
smnc_BT['Data_source'] = 'BT'
smnc_BT['Resampling'] = rs_smnc

# Write csv
# smnc_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# smnc_CV.to_csv(outdir_ml + gene.lower() + '_smnc_CV_allF.csv')
# smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')

# other data dependent variables
training_size_smnc = len(X_smnc)
n_features = len(X_smnc.columns)
# Class counts of the resampled target; the ratio below is label 0 / label 1.
yc1_smnc = Counter(y_smnc)
yc1_ratio_smnc = yc1_smnc[0] / yc1_smnc[1]

# smnc_all was assigned to without ever being created, which raised NameError.
# NOTE(review): it is built here from a copy of the transposed score table
# (smnc_allT) so that each value below becomes one extra column next to the
# scores — confirm this is the intended base frame.
smnc_all = smnc_allT.copy()
smnc_all['training_size'] = training_size_smnc
smnc_all['trainingY_ratio'] = round(yc1_ratio_smnc, 2)
smnc_all['n_features'] = n_features

###############################################################################

###############################################################################
###############################################################################
#%% COMBINING all dfs: WF and LF
# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns


###############################################################################
#====================
# Write output file
#====================
#combined_baseline_wf.to_csv(outFile_wf, index = False)
#print('\nFile successfully written:', outFile_wf)
###############################################################################