added run_7030_LOOP.py to loop through the resampling data and get processed output

2022-06-23 21:29:54 +01:00 · 2022-06-23 21:29:54 +01:00 · 3514e1b4ba
commit 3514e1b4ba
parent 1d3190899d
1 changed files with 229 additions and 0 deletions
--- a/scripts/ml/run_7030_LOOP.py
+++ b/scripts/ml/run_7030_LOOP.py
@ -0,0 +1,229 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jun 20 13:05:23 2022
+
+@author: tanu
+"""
+import re
+import argparse
+###############################################################################
+# gene  = 'pncA'
+# drug  = 'pyrazinamide'
+#total_mtblineage_uc = 8
+
+#%% command line args: case sensitive
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
+arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
+args = arg_parser.parse_args()
+
+drug    = args.drug
+gene    = args.gene
+
+###############################################################################
+
+###############################################################################
+#==================
+# Import data
+#==================
+from ml_data_7030 import *
+setvars(gene,drug)
+from ml_data_7030 import *
+
+# from YC run_all_ML: run locally
+#from UQ_yc_RunAllClfs import run_all_ML
+
+#====================
+# Import ML function 
+#====================
+# TT run all ML clfs: baseline model
+from MultModelsCl import MultModelsCl
+
+#==================
+# other vars
+#==================
+tts_split_7030    = '70_30'
+OutFile_suffix  = '7030'
+#==================
+# Specify outdir 
+#==================
+outdir_ml = outdir + 'ml/tts_7030/'
+print('\nOutput directory:', outdir_ml)
+
+outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
+outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
+
+###############################################################################
+score_type_ordermapD = { 'mcc'      : 1
+                   , 'fscore'       : 2
+                   , 'jcc'          : 3
+                   , 'precision'    : 4
+                   , 'recall'       : 5      
+                   , 'accuracy'     : 6  
+                   , 'roc_auc'      : 7
+                   , 'TN'           : 8
+                   , 'FP'           : 9
+                   , 'FN'           : 10
+                   , 'TP'           : 11  
+                   , 'trainingY_neg': 12  
+                   , 'trainingY_pos': 13    
+                   , 'blindY_neg'   : 14
+                   , 'blindY_pos'   : 15
+                   , 'fit_time'     : 16
+                   , 'score_time'   : 17
+                   }
+
+scoreCV_mapD = {'test_mcc'         : 'MCC'
+                , 'test_fscore'    : 'F1'
+                , 'test_precision' : 'Precision'
+                , 'test_recall'    : 'Recall'
+                , 'test_accuracy'  : 'Accuracy'
+                , 'test_roc_auc'   : 'ROC_AUC'
+                , 'test_jcc'       : 'JCC'
+                }
+
+scoreBT_mapD = {'bts_mcc'          : 'MCC'
+                , 'bts_fscore'     : 'F1'
+                , 'bts_precision'  : 'Precision'
+                , 'bts_recall'     : 'Recall'
+                , 'bts_accuracy'   : 'Accuracy'
+                , 'bts_roc_auc'    : 'ROC_AUC'
+                , 'bts_jcc'        : 'JCC'
+               }
+
+# # data dependent variables but NOT dependent on resampling
+# bts_size  = len(X_bts)
+# yc2       = Counter(y_bts)
+# yc2_ratio = yc2[0]/yc2[1]
+
+###############################################################################
+print('\n#####################################################################\n'
+      , '\nRunning ML analysis: feature groups '
+      , '\nGene name:', gene
+      , '\nDrug name:', drug)
+
+
+
+fooD = {'baseline_paramD': {
+                   'input_df': X
+                   , 'target': y
+                   , 'var_type': 'mixed'
+                   , 'resampling_type': 'none'}
+        ,
+        'smnc_paramD': {'input_df': X_smnc
+                   , 'target': y_smnc
+                   , 'var_type': 'mixed'
+                   , 'resampling_type': 'smnc'}
+}
+
+barD = {}
+for k, v in fooD.items():
+    #print(k)
+    print(fooD[k])
+    scores_7030D = MultModelsCl(**fooD[k]
+                        , tts_split_type = tts_split_7030
+                        , skf_cv = skf_cv
+                        , blind_test_df = X_bts
+                        , blind_test_target = y_bts
+                        , add_cm = True 
+                        , add_yn = True)
+    barD[k] = scores_7030D
+    
+
+ros_paramD = {input_df = X_ros
+                   , target = y_ros
+                   , var_type = 'mixed'
+                   , resampling_type = 'smnc'}
+
+
+rus_paramD = {input_df = X_rus
+                   , target = y_rus
+                   , var_type = 'mixed'
+                   , resampling_type = 'rus'}
+
+
+rouC_paramD = {input_df = X_rouC
+                   , target = y_rouC
+                   , var_type = 'mixed'
+                   , resampling_type = 'rouC'}
+
+
+
+
+#====
+scores_7030D = MultModelsCl(**rouC_paramD
+                    , tts_split_type = tts_split_7030
+                    , skf_cv = skf_cv
+                    , blind_test_df = X_bts
+                    , blind_test_target = y_bts
+                    , add_cm = True 
+                    , add_yn = True)
+
+###############################################################################
+#%% SMOTE NC: Smote Oversampling [Numerical + categorical]
+#================
+# Baseline
+# SMOTE NC: SMNC
+#================
+# Run MultModelsCl on the SMOTE-NC resampled data (X_smnc / y_smnc), passing
+# the blind test set (X_bts / y_bts) separately.
+# NOTE(review): `pd` and `Counter` used further down are not imported
+# explicitly in this script; presumably they arrive via the star import from
+# ml_data_7030 -- confirm.
+smnc_scores_mmD = MultModelsCl(input_df = X_smnc
+                    , target = y_smnc
+                    , var_type = 'mixed'
+                    , tts_split_type = tts_split_7030
+                    , resampling_type = 'smnc'
+                    , skf_cv = skf_cv
+                    , blind_test_df = X_bts
+                    , blind_test_target = y_bts
+                    , add_cm = True 
+                    , add_yn = True)
+
+# Wrap the returned scores in a DataFrame; it is transposed below before the
+# score columns are selected by name.
+smnc_all_scores = pd.DataFrame(smnc_scores_mmD)
+rs_smnc = 'smnc'
+#------------------------
+#  WF: only CV and BTS
+#-----------------------
+smnc_allT = smnc_all_scores.T
+
+# Keep only the columns whose name matches 'test_'. The trailing
+# `; smnc_CV.columns` is a bare expression and has no effect when the file is
+# run as a script.
+smnc_CV = smnc_allT.filter(regex='test_', axis = 1); smnc_CV.columns
+# map colnames for consistency to allow concatenating
+# NOTE(review): a column name with no entry in scoreCV_mapD would presumably
+# map to NaN -- confirm every filtered column is covered.
+smnc_CV.columns = smnc_CV.columns.map(scoreCV_mapD); smnc_CV.columns
+# NOTE(review): adding columns to the result of .filter() may trigger pandas'
+# SettingWithCopyWarning; consider taking an explicit .copy() first.
+smnc_CV['Data_source'] = 'CV'
+smnc_CV['Resampling']  = rs_smnc
+
+# Same steps for the columns whose name matches 'bts_'.
+smnc_BT = smnc_allT.filter(regex='bts_', axis = 1); smnc_BT.columns
+# map colnames for consistency to allow concatenating
+smnc_BT.columns = smnc_BT.columns.map(scoreBT_mapD); smnc_BT.columns
+smnc_BT['Data_source'] = 'BT'
+smnc_BT['Resampling']  = rs_smnc
+
+# Write csv
+# smnc_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
+# smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
+# smnc_CV.to_csv(outdir_ml + gene.lower() + '_smnc_CV_allF.csv')
+# smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
+
+# other data dependent variables
+training_size_smnc = len(X_smnc)
+n_features         = len(X_smnc.columns)
+yc1_smnc              = Counter(y_smnc)
+# Ratio of the count for label 0 to the count for label 1.
+# NOTE(review): if Counter is collections.Counter, a missing label 1 gives a
+# count of 0 and this line raises ZeroDivisionError.
+yc1_ratio_smnc        = yc1_smnc[0]/yc1_smnc[1]
+
+# NOTE(review): `smnc_all` is not assigned anywhere in the visible script;
+# unless the star import provides it, these three lines raise NameError.
+smnc_all['training_size']   = training_size_smnc
+smnc_all['trainingY_ratio'] = round(yc1_ratio_smnc,2)
+smnc_all['n_features']      = n_features
+
+###############################################################################
+
+###############################################################################
+###############################################################################
+#%% COMBINING all dfs: WF and LF
+# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
+
+
+###############################################################################
+#====================
+# Write output file
+#====================
+#combined_baseline_wf.to_csv(outFile_wf, index = False)
+#print('\nFile successfully written:', outFile_wf)
+###############################################################################