added run_7030_LOOP.py to loop through the resampling data and get processed output
This commit is contained in:
parent
1d3190899d
commit
3514e1b4ba
1 changed files with 229 additions and 0 deletions
229
scripts/ml/run_7030_LOOP.py
Normal file
229
scripts/ml/run_7030_LOOP.py
Normal file
|
@ -0,0 +1,229 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Mon Jun 20 13:05:23 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
import re
|
||||
import argparse
|
||||
###############################################################################
|
||||
# gene = 'pncA'
|
||||
# drug = 'pyrazinamide'
|
||||
#total_mtblineage_uc = 8
|
||||
|
||||
#%% command line args: case sensitive
|
||||
arg_parser = argparse.ArgumentParser()
|
||||
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
|
||||
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
|
||||
args = arg_parser.parse_args()
|
||||
|
||||
drug = args.drug
|
||||
gene = args.gene
|
||||
|
||||
###############################################################################
|
||||
|
||||
###############################################################################
|
||||
#==================
|
||||
# Import data
|
||||
#==================
|
||||
from ml_data_7030 import *
|
||||
setvars(gene,drug)
|
||||
from ml_data_7030 import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
#====================
|
||||
# Import ML function
|
||||
#====================
|
||||
# TT run all ML clfs: baseline model
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
#==================
|
||||
# other vars
|
||||
#==================
|
||||
tts_split_7030 = '70_30'
|
||||
OutFile_suffix = '7030'
|
||||
#==================
|
||||
# Specify outdir
|
||||
#==================
|
||||
outdir_ml = outdir + 'ml/tts_7030/'
|
||||
print('\nOutput directory:', outdir_ml)
|
||||
|
||||
outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
|
||||
outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
|
||||
|
||||
###############################################################################
|
||||
score_type_ordermapD = { 'mcc' : 1
|
||||
, 'fscore' : 2
|
||||
, 'jcc' : 3
|
||||
, 'precision' : 4
|
||||
, 'recall' : 5
|
||||
, 'accuracy' : 6
|
||||
, 'roc_auc' : 7
|
||||
, 'TN' : 8
|
||||
, 'FP' : 9
|
||||
, 'FN' : 10
|
||||
, 'TP' : 11
|
||||
, 'trainingY_neg': 12
|
||||
, 'trainingY_pos': 13
|
||||
, 'blindY_neg' : 14
|
||||
, 'blindY_pos' : 15
|
||||
, 'fit_time' : 16
|
||||
, 'score_time' : 17
|
||||
}
|
||||
|
||||
scoreCV_mapD = {'test_mcc' : 'MCC'
|
||||
, 'test_fscore' : 'F1'
|
||||
, 'test_precision' : 'Precision'
|
||||
, 'test_recall' : 'Recall'
|
||||
, 'test_accuracy' : 'Accuracy'
|
||||
, 'test_roc_auc' : 'ROC_AUC'
|
||||
, 'test_jcc' : 'JCC'
|
||||
}
|
||||
|
||||
scoreBT_mapD = {'bts_mcc' : 'MCC'
|
||||
, 'bts_fscore' : 'F1'
|
||||
, 'bts_precision' : 'Precision'
|
||||
, 'bts_recall' : 'Recall'
|
||||
, 'bts_accuracy' : 'Accuracy'
|
||||
, 'bts_roc_auc' : 'ROC_AUC'
|
||||
, 'bts_jcc' : 'JCC'
|
||||
}
|
||||
|
||||
# # data dependent variables but NOT dependent on resampling
|
||||
# bts_size = len(X_bts)
|
||||
# yc2 = Counter(y_bts)
|
||||
# yc2_ratio = yc2[0]/yc2[1]
|
||||
|
||||
###############################################################################
|
||||
print('\n#####################################################################\n'
|
||||
, '\nRunning ML analysis: feature groups '
|
||||
, '\nGene name:', gene
|
||||
, '\nDrug name:', drug)
|
||||
|
||||
|
||||
|
||||
fooD = {'baseline_paramD': {
|
||||
'input_df': X
|
||||
, 'target': y
|
||||
, 'var_type': 'mixed'
|
||||
, 'resampling_type': 'none'}
|
||||
,
|
||||
'smnc_paramD': {'input_df': X_smnc
|
||||
, 'target': y_smnc
|
||||
, 'var_type': 'mixed'
|
||||
, 'resampling_type': 'smnc'}
|
||||
}
|
||||
|
||||
# Run MultModelsCl once for every parameter set in fooD and keep each
# returned result in barD under the same key as its parameter set.
barD = {}
for k, v in fooD.items():
    #print(k)
    # v is the parameter dict for this run (same object as fooD[k])
    print(v)
    scores_7030D = MultModelsCl(**v
                                , tts_split_type = tts_split_7030
                                , skf_cv = skf_cv
                                , blind_test_df = X_bts
                                , blind_test_target = y_bts
                                , add_cm = True
                                , add_yn = True)
    barD[k] = scores_7030D
|
||||
|
||||
|
||||
# Parameter set for the 'ros' resampled data (X_ros / y_ros), using the same
# keys as the entries of fooD.
# Must be a dict literal with 'key': value pairs: `key = value` inside braces
# is a SyntaxError.
ros_paramD = {'input_df': X_ros
              , 'target': y_ros
              , 'var_type': 'mixed'
              # label follows the data passed in (X_ros/y_ros), in line with
              # the 'rus' and 'rouC' parameter sets
              , 'resampling_type': 'ros'}
|
||||
|
||||
|
||||
# Parameter set for the 'rus' resampled data (X_rus / y_rus), using the same
# keys as the entries of fooD.
# Must be a dict literal with 'key': value pairs: `key = value` inside braces
# is a SyntaxError.
rus_paramD = {'input_df': X_rus
              , 'target': y_rus
              , 'var_type': 'mixed'
              , 'resampling_type': 'rus'}
|
||||
|
||||
|
||||
# Parameter set for the 'rouC' resampled data (X_rouC / y_rouC), using the
# same keys as the entries of fooD.
# Must be a dict literal with 'key': value pairs: `key = value` inside braces
# is a SyntaxError, and the MultModelsCl call further down unpacks this
# object with `**`, which requires a mapping.
rouC_paramD = {'input_df': X_rouC
               , 'target': y_rouC
               , 'var_type': 'mixed'
               , 'resampling_type': 'rouC'}
|
||||
|
||||
|
||||
|
||||
|
||||
#====
|
||||
scores_7030D = MultModelsCl(**rouC_paramD
|
||||
, tts_split_type = tts_split_7030
|
||||
, skf_cv = skf_cv
|
||||
, blind_test_df = X_bts
|
||||
, blind_test_target = y_bts
|
||||
, add_cm = True
|
||||
, add_yn = True)
|
||||
|
||||
###############################################################################
|
||||
#%% SMOTE NC: Smote Oversampling [Numerical + categorical]
|
||||
#================
|
||||
# Baseline
|
||||
# SMOTE NC: SMNC
|
||||
#================
|
||||
smnc_scores_mmD = MultModelsCl(input_df = X_smnc
|
||||
, target = y_smnc
|
||||
, var_type = 'mixed'
|
||||
, tts_split_type = tts_split_7030
|
||||
, resampling_type = 'smnc'
|
||||
, skf_cv = skf_cv
|
||||
, blind_test_df = X_bts
|
||||
, blind_test_target = y_bts
|
||||
, add_cm = True
|
||||
, add_yn = True)
|
||||
|
||||
# Turn the MultModelsCl result for the SMOTE-NC data into a DataFrame.
# NOTE(review): assumes smnc_scores_mmD is a dict of per-model score dicts,
# so that after transposing, each row is a model and each column a score
# name -- confirm against MultModelsCl.
smnc_all_scores = pd.DataFrame(smnc_scores_mmD)
rs_smnc = 'smnc'
#------------------------
# WF: only CV and BTS
#-----------------------
smnc_allT = smnc_all_scores.T

# Columns whose name contains 'test_'.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of smnc_allT (avoids SettingWithCopyWarning).
smnc_CV = smnc_allT.filter(regex='test_', axis = 1).copy()
# map colnames for consistency to allow concatenating
# (names missing from scoreCV_mapD become NaN)
smnc_CV.columns = smnc_CV.columns.map(scoreCV_mapD)
smnc_CV['Data_source'] = 'CV'
smnc_CV['Resampling'] = rs_smnc

# Columns whose name contains 'bts_'; copied for the same reason as above.
smnc_BT = smnc_allT.filter(regex='bts_', axis = 1).copy()
# map colnames for consistency to allow concatenating
# (names missing from scoreBT_mapD become NaN)
smnc_BT.columns = smnc_BT.columns.map(scoreBT_mapD)
smnc_BT['Data_source'] = 'BT'
smnc_BT['Resampling'] = rs_smnc
|
||||
|
||||
# Write csv
|
||||
# smnc_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
# smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
# smnc_CV.to_csv(outdir_ml + gene.lower() + '_smnc_CV_allF.csv')
|
||||
# smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
# other data dependent variables
|
||||
training_size_smnc = len(X_smnc)
|
||||
n_features = len(X_smnc.columns)
|
||||
yc1_smnc = Counter(y_smnc)
|
||||
yc1_ratio_smnc = yc1_smnc[0]/yc1_smnc[1]
|
||||
|
||||
# NOTE(review): `smnc_all` is never assigned in the visible part of this
# script (only smnc_all_scores, smnc_allT, smnc_CV and smnc_BT are). Unless
# the star-import from ml_data_7030 provides it, this line raises NameError.
# TODO: confirm which frame these summary columns are meant to be added to.
smnc_all['training_size'] = training_size_smnc
|
||||
smnc_all['trainingY_ratio'] = round(yc1_ratio_smnc,2)
|
||||
smnc_all['n_features'] = n_features
|
||||
|
||||
###############################################################################
|
||||
|
||||
###############################################################################
|
||||
###############################################################################
|
||||
#%% COMBINING all dfs: WF and LF
|
||||
# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
|
||||
|
||||
|
||||
###############################################################################
|
||||
#====================
|
||||
# Write output file
|
||||
#====================
|
||||
#combined_baseline_wf.to_csv(outFile_wf, index = False)
|
||||
#print('\nFile successfully written:', outFile_wf)
|
||||
###############################################################################
|
Loading…
Add table
Add a link
Reference in a new issue