#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 20 13:05:23 2022

@author: tanu

run_7030_LOOP.py: run MultModelsCl over several training-data variants
(baseline and resampled) using the '70_30' split label, scoring each against
the blind test set (X_bts / y_bts).

Usage:
    run_7030_LOOP.py -d <drug name> -g <gene name>

Both arguments are case sensitive and default to the empty string.
"""
import argparse
import re
from collections import Counter

# pd and Counter were previously only available via the star-import of
# ml_data_7030 below; importing them explicitly keeps this script working
# even if that module stops re-exporting them.
import pandas as pd

###############################################################################
# gene = 'pncA'
# drug = 'pyrazinamide'
#total_mtblineage_uc = 8

#%% command line args: case sensitive
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
args = arg_parser.parse_args()

drug = args.drug
gene = args.gene

###############################################################################
#==================
# Import data
#==================
# The first star-import brings setvars() into scope. The second one re-binds
# the module's names after setvars(gene, drug) has run.
# NOTE(review): presumably setvars() populates the module-level data used
# below (X, y, X_smnc, X_bts, skf_cv, outdir, ...) - confirm in ml_data_7030.
from ml_data_7030 import *
setvars(gene, drug)
from ml_data_7030 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

#====================
# Import ML function
#====================
# TT run all ML clfs: baseline model
from MultModelsCl import MultModelsCl

#==================
# other vars
#==================
tts_split_7030 = '70_30'
OutFile_suffix = '7030'

#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)

outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'

###############################################################################
# Display order of the score types.
score_type_ordermapD = { 'mcc'            : 1
                        , 'fscore'        : 2
                        , 'jcc'           : 3
                        , 'precision'     : 4
                        , 'recall'        : 5
                        , 'accuracy'      : 6
                        , 'roc_auc'       : 7
                        , 'TN'            : 8
                        , 'FP'            : 9
                        , 'FN'            : 10
                        , 'TP'            : 11
                        , 'trainingY_neg' : 12
                        , 'trainingY_pos' : 13
                        , 'blindY_neg'    : 14
                        , 'blindY_pos'    : 15
                        , 'fit_time'      : 16
                        , 'score_time'    : 17
                        }

# Cross-validation ('test_*') score names -> common column names.
scoreCV_mapD = {'test_mcc'          : 'MCC'
                , 'test_fscore'     : 'F1'
                , 'test_precision'  : 'Precision'
                , 'test_recall'     : 'Recall'
                , 'test_accuracy'   : 'Accuracy'
                , 'test_roc_auc'    : 'ROC_AUC'
                , 'test_jcc'        : 'JCC'
                }

# Blind-test ('bts_*') score names -> the same common column names, so the
# CV and BT tables can be concatenated.
scoreBT_mapD = {'bts_mcc'           : 'MCC'
                , 'bts_fscore'      : 'F1'
                , 'bts_precision'   : 'Precision'
                , 'bts_recall'      : 'Recall'
                , 'bts_accuracy'    : 'Accuracy'
                , 'bts_roc_auc'     : 'ROC_AUC'
                , 'bts_jcc'         : 'JCC'
                }

# # data dependent variables but NOT dependent on resampling
# bts_size = len(X_bts)
# yc2 = Counter(y_bts)
# yc2_ratio = yc2[0]/yc2[1]

###############################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis: feature groups '
      , '\nGene name:', gene
      , '\nDrug name:', drug)

# Parameter sets that are run in the loop below; each value is unpacked as
# keyword arguments into MultModelsCl.
fooD = {'baseline_paramD': {
            'input_df': X
            , 'target': y
            , 'var_type': 'mixed'
            , 'resampling_type': 'none'}
        ,
        'smnc_paramD': {
            'input_df': X_smnc
            , 'target': y_smnc
            , 'var_type': 'mixed'
            , 'resampling_type': 'smnc'}
        }

# Scores per parameter set, keyed by the parameter-set name.
barD = {}
for k, v in fooD.items():
    #print(k)
    print(v)
    scores_7030D = MultModelsCl(**v
                                , tts_split_type = tts_split_7030
                                , skf_cv = skf_cv
                                , blind_test_df = X_bts
                                , blind_test_target = y_bts
                                , add_cm = True
                                , add_yn = True)
    barD[k] = scores_7030D

# The three parameter sets below were written as `{input_df = X_ros, ...}`,
# which is a SyntaxError; they are now ordinary dicts with string keys so
# that they can be unpacked with ** like the entries of fooD.
# 'resampling_type' for ros was 'smnc' (copy-paste slip); it is now 'ros' to
# match how every other parameter set labels itself.
ros_paramD = {'input_df': X_ros
              , 'target': y_ros
              , 'var_type': 'mixed'
              , 'resampling_type': 'ros'}

rus_paramD = {'input_df': X_rus
              , 'target': y_rus
              , 'var_type': 'mixed'
              , 'resampling_type': 'rus'}

rouC_paramD = {'input_df': X_rouC
               , 'target': y_rouC
               , 'var_type': 'mixed'
               , 'resampling_type': 'rouC'}

#====
# NOTE: this re-binds scores_7030D, replacing the value left by the loop
# above (the per-set results remain available in barD).
scores_7030D = MultModelsCl(**rouC_paramD
                            , tts_split_type = tts_split_7030
                            , skf_cv = skf_cv
                            , blind_test_df = X_bts
                            , blind_test_target = y_bts
                            , add_cm = True
                            , add_yn = True)

###############################################################################
#%% SMOTE NC: Smote Oversampling [Numerical + categorical]
#================
# Baseline
# SMOTE NC: SMNC
#================
smnc_scores_mmD = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , tts_split_type = tts_split_7030
                               , resampling_type = 'smnc'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts
                               , add_cm = True
                               , add_yn = True)

smnc_all_scores = pd.DataFrame(smnc_scores_mmD)
rs_smnc = 'smnc'

#------------------------
# WF: only CV and BTS
#-----------------------
smnc_allT = smnc_all_scores.T

smnc_CV = smnc_allT.filter(regex='test_', axis = 1)
# map colnames for consistency to allow concatenating
smnc_CV.columns = smnc_CV.columns.map(scoreCV_mapD)
smnc_CV['Data_source'] = 'CV'
smnc_CV['Resampling'] = rs_smnc

smnc_BT = smnc_allT.filter(regex='bts_', axis = 1)
# map colnames for consistency to allow concatenating
smnc_BT.columns = smnc_BT.columns.map(scoreBT_mapD)
smnc_BT['Data_source'] = 'BT'
smnc_BT['Resampling'] = rs_smnc

# Write csv
# smnc_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# smnc_CV.to_csv(outdir_ml + gene.lower() + '_smnc_CV_allF.csv')
# smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')

# other data dependent variables
training_size_smnc = len(X_smnc)
n_features = len(X_smnc.columns)
yc1_smnc = Counter(y_smnc)
yc1_ratio_smnc = yc1_smnc[0]/yc1_smnc[1]

# `smnc_all` was assigned into without ever being defined (NameError).
# NOTE(review): it is bound here to a copy of the full score table so the
# annotations below do not alter smnc_all_scores; presumably a filtered
# "long format" table was intended - confirm and replace if so.
smnc_all = smnc_all_scores.copy()
smnc_all['training_size'] = training_size_smnc
smnc_all['trainingY_ratio'] = round(yc1_ratio_smnc,2)
smnc_all['n_features'] = n_features

###############################################################################
#%% COMBINING all dfs: WF and LF
# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns


###############################################################################
#====================
# Write output file
#====================
#combined_baseline_wf.to_csv(outFile_wf, index = False)
#print('\nFile successfully written:', outFile_wf)
###############################################################################