#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 20 13:05:23 2022

@author: tanu

Run the multi-classifier ML pipeline (MultModelsCl) on the 70/30
train/test split data for a given gene/drug pair.

The script:
  1. reads the drug and gene names from the command line,
  2. loads the data via ``ml_data_7030`` (``setvars(gene, drug)``),
  3. runs ``MultModelsCl`` on the baseline and resampled training data,
     each evaluated against the blind test set (``X_bts``/``y_bts``),
  4. builds the CV and blind-test score tables for the SMOTE-NC run.

Usage:
    <this_script> -d <drug> -g <gene>
"""
import re
import argparse
from collections import Counter

import pandas as pd
###############################################################################
# gene = 'pncA'
# drug = 'pyrazinamide'
#total_mtblineage_uc = 8

#%% command line args: case sensitive
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
args = arg_parser.parse_args()

drug = args.drug
gene = args.gene
###############################################################################
###############################################################################
#==================
# Import data
#==================
# NOTE(review): the star import is deliberately done twice. The first one
# brings setvars() into scope; the second one presumably re-binds the data
# variables (X, y, X_bts, outdir, skf_cv, ...) that setvars() populates in
# ml_data_7030 -- confirm against that module.
from ml_data_7030 import *
setvars(gene,drug)
from ml_data_7030 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

#====================
# Import ML function
#====================
# TT run all ML clfs: baseline model
from MultModelsCl import MultModelsCl

#==================
# other vars
#==================
tts_split_7030 = '70_30'
OutFile_suffix = '7030'

#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)

outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'

###############################################################################
# Display order of the score types
score_type_ordermapD = { 'mcc'            : 1
                        , 'fscore'        : 2
                        , 'jcc'           : 3
                        , 'precision'     : 4
                        , 'recall'        : 5
                        , 'accuracy'      : 6
                        , 'roc_auc'       : 7
                        , 'TN'            : 8
                        , 'FP'            : 9
                        , 'FN'            : 10
                        , 'TP'            : 11
                        , 'trainingY_neg' : 12
                        , 'trainingY_pos' : 13
                        , 'blindY_neg'    : 14
                        , 'blindY_pos'    : 15
                        , 'fit_time'      : 16
                        , 'score_time'    : 17
                        }

# Cross-validation score names -> common column names
scoreCV_mapD = {'test_mcc'          : 'MCC'
                , 'test_fscore'     : 'F1'
                , 'test_precision'  : 'Precision'
                , 'test_recall'     : 'Recall'
                , 'test_accuracy'   : 'Accuracy'
                , 'test_roc_auc'    : 'ROC_AUC'
                , 'test_jcc'        : 'JCC'
                }

# Blind-test score names -> the same common column names
scoreBT_mapD = {'bts_mcc'           : 'MCC'
                , 'bts_fscore'      : 'F1'
                , 'bts_precision'   : 'Precision'
                , 'bts_recall'      : 'Recall'
                , 'bts_accuracy'    : 'Accuracy'
                , 'bts_roc_auc'     : 'ROC_AUC'
                , 'bts_jcc'         : 'JCC'
                }

# # data dependent variables but NOT dependent on resampling
# bts_size = len(X_bts)
# yc2 = Counter(y_bts)
# yc2_ratio = yc2[0]/yc2[1]

###############################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis: feature groups '
      , '\nGene name:', gene
      , '\nDrug name:', drug)

# Keyword arguments for MultModelsCl, one entry per training-data flavour
fooD = {'baseline_paramD': {  'input_df'         : X
                            , 'target'           : y
                            , 'var_type'         : 'mixed'
                            , 'resampling_type'  : 'none'}

        , 'smnc_paramD': {    'input_df'         : X_smnc
                            , 'target'           : y_smnc
                            , 'var_type'         : 'mixed'
                            , 'resampling_type'  : 'smnc'}
        }

# Scores returned by MultModelsCl, keyed like fooD
barD = {}
for param_name, paramD in fooD.items():
    #print(param_name)
    print(paramD)
    scores_7030D = MultModelsCl(**paramD
                                , tts_split_type    = tts_split_7030
                                , skf_cv            = skf_cv
                                , blind_test_df     = X_bts
                                , blind_test_target = y_bts
                                , add_cm            = True
                                , add_yn            = True)
    barD[param_name] = scores_7030D

# Parameter dicts for the remaining resampling flavours. These must be real
# dicts with string keys so they can be unpacked with ** below.
# NOTE(review): ros_paramD was labelled 'smnc' in the original, which looks
# like a copy-paste slip given it wraps X_ros/y_ros; it is labelled 'ros'
# here. ros_paramD and rus_paramD are not used further in this script.
ros_paramD = {  'input_df'         : X_ros
              , 'target'           : y_ros
              , 'var_type'         : 'mixed'
              , 'resampling_type'  : 'ros'}

rus_paramD = {  'input_df'         : X_rus
              , 'target'           : y_rus
              , 'var_type'         : 'mixed'
              , 'resampling_type'  : 'rus'}

rouC_paramD = { 'input_df'         : X_rouC
              , 'target'           : y_rouC
              , 'var_type'         : 'mixed'
              , 'resampling_type'  : 'rouC'}

#====
scores_7030D = MultModelsCl(**rouC_paramD
                            , tts_split_type    = tts_split_7030
                            , skf_cv            = skf_cv
                            , blind_test_df     = X_bts
                            , blind_test_target = y_bts
                            , add_cm            = True
                            , add_yn            = True)

###############################################################################
#%% SMOTE NC: Smote Oversampling [Numerical + categorical]
#================
# Baseline
# SMOTE NC: SMNC
#================
smnc_scores_mmD = MultModelsCl(input_df                  = X_smnc
                               , target                  = y_smnc
                               , var_type                = 'mixed'
                               , tts_split_type          = tts_split_7030
                               , resampling_type         = 'smnc'
                               , skf_cv                  = skf_cv
                               , blind_test_df           = X_bts
                               , blind_test_target       = y_bts
                               , add_cm                  = True
                               , add_yn                  = True
                               , return_formatted_output = True)

smnc_all_scores = pd.DataFrame(smnc_scores_mmD)
rs_smnc = 'smnc'

#------------------------
# WF: only CV and BTS
#-----------------------
smnc_allT = smnc_all_scores.T

# .copy() so the column assignments below act on independent frames
smnc_CV = smnc_allT.filter(regex='test_', axis = 1).copy()
# map colnames for consistency to allow concatenating
smnc_CV.columns = smnc_CV.columns.map(scoreCV_mapD)
smnc_CV['Data_source'] = 'CV'
smnc_CV['Resampling'] = rs_smnc

smnc_BT = smnc_allT.filter(regex='bts_', axis = 1).copy()
# map colnames for consistency to allow concatenating
smnc_BT.columns = smnc_BT.columns.map(scoreBT_mapD)
smnc_BT['Data_source'] = 'BT'
smnc_BT['Resampling'] = rs_smnc

# Write csv
# smnc_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# smnc_CV.to_csv(outdir_ml + gene.lower() + '_smnc_CV_allF.csv')
# smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')

# other data dependent variables
training_size_smnc = len(X_smnc)
n_features = len(X_smnc.columns)
yc1_smnc = Counter(y_smnc)
yc1_ratio_smnc = yc1_smnc[0]/yc1_smnc[1]

# NOTE(review): 'smnc_all' was used here without ever being defined
# (NameError). It is taken to be the full wide-format score table, so a copy
# of smnc_allT is used -- confirm whether a long-format (LF) table was
# intended instead.
smnc_all = smnc_allT.copy()
smnc_all['training_size'] = training_size_smnc
smnc_all['trainingY_ratio'] = round(yc1_ratio_smnc,2)
smnc_all['n_features'] = n_features

###############################################################################
###############################################################################
###############################################################################
#%% COMBINING all dfs: WF and LF
# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns

###############################################################################
#====================
# Write output file
#====================
#combined_baseline_wf.to_csv(outFile_wf, index = False)
#print('\nFile successfully written:', outFile_wf)
###############################################################################