#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 20 13:05:23 2022

@author: tanu

Baseline classification run on the 70/30 train/test split.

The script runs ``MultModelsCl`` once on the unresampled training data and
once for each resampled training set (smnc, ros, rus, rouC), always scoring
against the same blind test set.  The scores are then written as two CSVs:

* WF: one row per model, one column per score (CV and BT rows stacked).
* LF: one row per score, one column per model, plus run metadata.

Usage (arguments are case sensitive), e.g.::

    script.py --gene pncA --drug pyrazinamide

NOTE(review): ``outdir``, ``skf_cv``, ``X``/``y``, ``X_bts``/``y_bts`` and the
resampled ``X_*``/``y_*`` sets are not defined here; they are presumably
provided by the wildcard import from ``ml_data_7030`` -- confirm.
"""
import argparse
import os
import re
import sys
from collections import Counter

import pandas as pd

###############################################################################
#%% command line args + data import
# Guarded so that the helper functions below can be imported (and tested)
# without parsing arguments or loading project data.  This section stays ahead
# of the definitions below so that, exactly as before, names defined in this
# script take precedence over anything pulled in by the wildcard imports.
if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
    arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
    args = arg_parser.parse_args()

    drug = args.drug
    gene = args.gene

    #==================
    # Import data
    #==================
    # The first wildcard import makes setvars() available; the second one is
    # repeated after setvars(gene, drug) has run.
    # NOTE(review): presumably setvars() populates the module-level data
    # variables that the second import then re-exports -- confirm.
    from ml_data_7030 import *
    setvars(gene, drug)
    from ml_data_7030 import *

    #====================
    # Import ML function
    #====================
    # TT run all ML clfs: baseline model
    from MultModelsCl import MultModelsCl

###############################################################################
#==================
# other vars
#==================
tts_split = '70/30'
OutFile_suffix = '7030'

# Row order of the scores in the LF output.
score_type_ordermapD = { 'mcc'            : 1
                         , 'fscore'       : 2
                         , 'jcc'          : 3
                         , 'precision'    : 4
                         , 'recall'       : 5
                         , 'accuracy'     : 6
                         , 'roc_auc'      : 7
                         , 'TN'           : 8
                         , 'FP'           : 9
                         , 'FN'           : 10
                         , 'TP'           : 11
                         , 'trainingY_neg': 12
                         , 'trainingY_pos': 13
                         , 'blindY_neg'   : 14
                         , 'blindY_pos'   : 15
                         , 'fit_time'     : 16
                         , 'score_time'   : 17
                         }

# CV and BT score names are mapped onto the same labels so that the CV and BT
# frames share column names and can be concatenated.
scoreCV_mapD = {'test_mcc'         : 'MCC'
                , 'test_fscore'    : 'F1'
                , 'test_precision' : 'Precision'
                , 'test_recall'    : 'Recall'
                , 'test_accuracy'  : 'Accuracy'
                , 'test_roc_auc'   : 'ROC_AUC'
                , 'test_jcc'       : 'JCC'
                }

scoreBT_mapD = {'bts_mcc'         : 'MCC'
                , 'bts_fscore'    : 'F1'
                , 'bts_precision' : 'Precision'
                , 'bts_recall'    : 'Recall'
                , 'bts_accuracy'  : 'Accuracy'
                , 'bts_roc_auc'   : 'ROC_AUC'
                , 'bts_jcc'       : 'JCC'
                }

# Score rows kept in the LF output (everything else, e.g. train_*, is dropped).
SCORE_ROW_REGEX = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos'

# Rows whose name contains 'bts_' are labelled BT; all other rows are labelled CV.
bt_pattern = re.compile(r'bts_.*')

###############################################################################
#%% Helper functions

def neg_pos_ratio(target):
    """Return count(label 0) / count(label 1) for the labels in *target*.

    Returns NaN (instead of raising ZeroDivisionError) when label 1 is absent,
    so a degenerate label set does not discard scores that were already
    computed.
    """
    label_counts = Counter(target)
    if not label_counts[1]:
        return float('nan')
    return label_counts[0] / label_counts[1]


def wf_split_scores(all_scores, resampling):
    """Build the WF frames (one row per model) for a single resampling method.

    Parameters
    ----------
    all_scores : DataFrame with score names as index and models as columns.
    resampling : label stored in the 'Resampling' column.

    Returns
    -------
    (cv_scores, bt_scores) : the 'test_' and 'bts_' scores respectively, with
    columns renamed via scoreCV_mapD / scoreBT_mapD and tagged with
    'Data_source' ('CV' / 'BT') and 'Resampling'.
    """
    scores_by_model = all_scores.T

    # .copy(): the new columns are added to an independent frame, not to a
    # possible view of the filtered data.
    cv_scores = scores_by_model.filter(regex = 'test_', axis = 1).copy()
    cv_scores.columns = cv_scores.columns.map(scoreCV_mapD)
    cv_scores['Data_source'] = 'CV'
    cv_scores['Resampling'] = resampling

    bt_scores = scores_by_model.filter(regex = 'bts_', axis = 1).copy()
    bt_scores.columns = bt_scores.columns.map(scoreBT_mapD)
    bt_scores['Data_source'] = 'BT'
    bt_scores['Resampling'] = resampling

    return cv_scores, bt_scores


def lf_build_scores(all_scores, resampling, training_size, trainingY_ratio, n_features):
    """Build the LF frame (one row per score) for a single resampling method.

    Keeps the rows matching SCORE_ROW_REGEX, labels each row as 'BT' or 'CV',
    strips the 'bts_'/'test_' prefix into 'score_type', orders the rows by
    Data_source and score_type_ordermapD, and appends the run metadata.

    Exits (SystemExit) when any score listed in score_type_ordermapD is
    missing from *all_scores*, because the rows could then not be ordered.
    """
    long_scores = all_scores.filter(regex = SCORE_ROW_REGEX, axis = 0)
    long_scores = long_scores.reset_index()
    long_scores.rename(columns = {'index': 'original_names'}, inplace = True)

    # Indicate whether BT or CV
    long_scores['Data_source'] = long_scores['original_names'].map(
        lambda score_name: 'BT' if bt_pattern.search(score_name) else 'CV')
    long_scores['score_type'] = long_scores['original_names'].str.replace(
        'bts_|test_', '', regex = True)

    if not set(score_type_ordermapD).issubset(set(long_scores['score_type'])):
        sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')

    print('\nPASS: sorting df by score that is mapped onto the order I want')
    long_scores['score_order'] = long_scores['score_type'].map(score_type_ordermapD)
    long_scores.sort_values(by = ['Data_source', 'score_order']
                            , ascending = [True, True]
                            , inplace = True)

    # add cols: specific
    long_scores['Resampling']      = resampling
    long_scores['training_size']   = training_size
    long_scores['trainingY_ratio'] = round(trainingY_ratio, 2)
    long_scores['n_features']      = n_features
    return long_scores


def score_resampling(run_models, train_df, train_target, resampling
                     , skf_cv, blind_test_df, blind_test_target):
    """Run all classifiers on one training set and post-process the scores.

    *run_models* is the model-running callable (MultModelsCl in this script);
    it is called with the keyword arguments shown below and its result is
    passed to ``pd.DataFrame``.

    Returns (all_scores, cv_scores, bt_scores, long_scores).
    """
    scores_mmD = run_models(input_df = train_df
                            , target = train_target
                            , var_type = 'mixed'
                            , skf_cv = skf_cv
                            , blind_test_df = blind_test_df
                            , blind_test_target = blind_test_target
                            , add_cm = True
                            , add_yn = True)
    all_scores = pd.DataFrame(scores_mmD)

    # WF: only CV and BTS
    cv_scores, bt_scores = wf_split_scores(all_scores, resampling)

    # LF*: CV + BTS + Other info
    long_scores = lf_build_scores(all_scores
                                  , resampling
                                  , training_size = len(train_df)
                                  , trainingY_ratio = neg_pos_ratio(train_target)
                                  , n_features = len(train_df.columns))
    return all_scores, cv_scores, bt_scores, long_scores


def combine_resampled_dfs(dfs, summary_cols, ignore_index, split_label
                          , tag = '', name_suffix = ''):
    """Row-bind *dfs* on their common columns and verify the resulting shape.

    https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns

    Parameters
    ----------
    dfs          : list of DataFrames to concatenate.
    summary_cols : columns whose distinct rows are printed as a run summary.
    ignore_index : passed to ``pd.concat``.
    split_label  : train/test split label used in the log message.
    tag          : text inserted into the log/exit messages (e.g. ' [WF]').
    name_suffix  : suffix of the frame name in the PASS message (e.g. '_wf').

    Exits (SystemExit) when the frames do not all share the same columns, or
    when the combined frame does not have len(dfs) * max(nrows) rows and
    max(ncols) columns.
    """
    nrows_max = max(len(df) for df in dfs)
    ncols_max = max(len(df.columns) for df in dfs)
    expected_nrows = len(dfs) * nrows_max
    expected_ncols = ncols_max

    shared_cols = set.intersection(*(set(df.columns) for df in dfs))
    if len(shared_cols) != ncols_max:
        sys.exit('\nConcatenting dfs not possible' + tag + ',check numbers ')

    # Keep the column order of the first frame: iterating the set directly
    # would give an arbitrary column order in the output file.
    common_cols = [col for col in dfs[0].columns if col in shared_cols]

    combined = pd.concat([df[common_cols] for df in dfs], ignore_index = ignore_index)
    resampling_methods = combined[summary_cols].drop_duplicates()
    print('\nConcatenating dfs with different resampling methods' + tag + ':', split_label
          , '\nNo. of dfs combining:', len(dfs)
          , '\nThe sampling methods are:'
          , '\n', resampling_methods)

    if len(combined) == expected_nrows and len(combined.columns) == expected_ncols:
        print('\nPASS:', len(dfs), 'dfs successfully combined'
              , '\nnrows in combined_df' + name_suffix + ':', len(combined)
              , '\nncols in combined_df' + name_suffix + ':', len(combined.columns))
    else:
        print('\nFAIL: concatenating failed'
              , '\nExpected nrows:', expected_nrows
              , '\nGot:', len(combined)
              , '\nExpected ncols:', expected_ncols
              , '\nGot:', len(combined.columns))
        sys.exit()

    return combined

###############################################################################
#%% Run
if __name__ == '__main__':
    #==================
    # Specify outdir
    #==================
    outdir_ml = outdir + 'ml/tts_7030/'
    # Create the directory up front so that a missing folder cannot make the
    # final to_csv() calls fail after all models have already been run.
    os.makedirs(outdir_ml, exist_ok = True)
    print('\nOutput directory:', outdir_ml)
    outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
    outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'

    # data dependent variables but NOT dependent on resampling
    bts_size = len(X_bts)
    yc2_ratio = neg_pos_ratio(y_bts)

    print('\n#####################################################################\n'
          , '\nRunning ML analysis: feature groups '
          , '\nGene name:', gene
          , '\nDrug name:', drug)

    # Arguments shared by every run: same CV splitter, same blind test set.
    run_kwargs = dict(skf_cv = skf_cv
                      , blind_test_df = X_bts
                      , blind_test_target = y_bts)

    #%% Basic: No Oversampling
    rs_none = 'none'
    baseline_all_scores, baseline_CV, baseline_BT, baseline_all = score_resampling(
        MultModelsCl, X, y, rs_none, **run_kwargs)

    #%% SMOTE NC: Smote Oversampling [Numerical + categorical]
    rs_smnc = 'smnc'
    smnc_all_scores, smnc_CV, smnc_BT, smnc_all = score_resampling(
        MultModelsCl, X_smnc, y_smnc, rs_smnc, **run_kwargs)

    #%% ROS: Random Over Sampling [Numerical + categorical]
    rs_ros = 'ros'
    ros_all_scores, ros_CV, ros_BT, ros_all = score_resampling(
        MultModelsCl, X_ros, y_ros, rs_ros, **run_kwargs)

    #%% RUS: Random Under Sampling [Numerical + categorical]
    rs_rus = 'rus'
    rus_all_scores, rus_CV, rus_BT, rus_all = score_resampling(
        MultModelsCl, X_rus, y_rus, rs_rus, **run_kwargs)

    #%% ROS+RUS Combined: [Numerical + categorical]
    rs_rouC = 'rouC'
    rouC_all_scores, rouC_CV, rouC_BT, rouC_all = score_resampling(
        MultModelsCl, X_rouC, y_rouC, rs_rouC, **run_kwargs)

    ###########################################################################
    #%% Combine WF
    dfs_combine_wf = [baseline_BT, smnc_BT, ros_BT, rus_BT, rouC_BT,
                      baseline_CV, smnc_CV, ros_CV, rus_CV, rouC_CV]

    combined_baseline_wf = combine_resampled_dfs(dfs_combine_wf
                                                 , summary_cols = ['Resampling']
                                                 , ignore_index = False
                                                 , split_label = tts_split
                                                 , tag = ' [WF]'
                                                 , name_suffix = '_wf')

    # Add index as a column
    combined_baseline_wf = combined_baseline_wf.reset_index()
    combined_baseline_wf.rename(columns = {'index': 'Model_name'}, inplace = True)

    # sort df: Resampling, Data_source, and MCC
    combined_baseline_wf.sort_values(by = ['Resampling', 'Data_source', 'MCC']
                                     , ascending = [True, True, False]
                                     , inplace = True)

    ###########################################################################
    #%% Combine LF
    dfs_combine = [baseline_all, smnc_all, ros_all, rus_all, rouC_all]

    combined_baseline = combine_resampled_dfs(dfs_combine
                                              , summary_cols = ['Resampling', 'training_size']
                                              , ignore_index = True
                                              , split_label = tts_split)

    # Add further column indications
    combined_baseline['test_size']   = bts_size
    combined_baseline['tts_split']   = tts_split
    combined_baseline['testY_ratio'] = round(yc2_ratio, 2)

    # change to column names to be lower case for consistency
    combined_baseline.rename(columns = {'Resampling'   : 'resampling'
                                        , 'Data_source': 'data_source'}
                             , inplace = True)

    # sort df: resampling, data_source, score_order
    combined_baseline.sort_values(by = ['resampling', 'data_source', 'score_order']
                                  , ascending = [True, True, True]
                                  , inplace = True)

    ###########################################################################
    #====================
    # Write output file
    #====================
    combined_baseline_wf.to_csv(outFile_wf, index = False)
    print('\nFile successfully written:', outFile_wf)

    combined_baseline.to_csv(outFile_lf, index = False)
    print('\nFile successfully written:', outFile_lf)
###############################################################################