#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 20 13:05:23 2022

@author: tanu

Baseline ML analysis for a 70/30 train/test split.

Runs MultModelsCl on the original training data and on four resampled
versions of it (SMOTE NC, ROS, RUS, ROS + RUS combined), reshapes each result
into a long table of cross-validation (CV) and blind-test (BT) scores, stacks
the five tables and writes them to a single csv file.

Usage (argument values are case sensitive):
    <this_script> -g pncA -d pyrazinamide
"""
import argparse
import re
import sys

# Imported explicitly so the script does not depend on the star import below
# happening to re-export `pd` (and `sys`).
import pandas as pd

###############################################################################
#%% command line args: case sensitive
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
args = arg_parser.parse_args()

drug = args.drug
gene = args.gene

###############################################################################
#==================
# other vars
#==================
tts_split = '70/30'
OutFile_suffix = '7030'

###############################################################################
#==================
# Import data
#==================
# The star import is deliberately done twice: once to get setvars(), and once
# more after setvars(gene, drug) has run.
# NOTE(review): presumably the second import picks up the gene/drug specific
# data (X, y, X_bts, y_bts, X_smnc, ..., skf_cv, outdir) - confirm against
# ml_data_7030.
from ml_data_7030 import *
setvars(gene,drug)
from ml_data_7030 import *

#====================
# Import ML function
#====================
# TT run all ML clfs: baseline model
from MultModelsCl import MultModelsCl

###############################################################################
#==================
# Helpers
#==================
# Row labels of the raw score table that are kept in the output
score_rows_regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos'

# Rows whose name contains 'bts_' are blind-test scores; everything else is
# labelled as CV. Compiled once and shared by every call.
bt_pattern = re.compile(r'bts_.*')


def run_all_models(input_df, target):
    """Run MultModelsCl on one training set with the shared settings.

    The blind test set (X_bts, y_bts) and the CV splitter (skf_cv) are the
    same for every resampling method, so only the training data varies.

    Parameters
    ----------
    input_df : training features
    target : training labels

    Returns
    -------
    Whatever MultModelsCl returns; it is only ever passed to pd.DataFrame().
    """
    return MultModelsCl(input_df = input_df
                        , target = target
                        , var_type = 'mixed'
                        , skf_cv = skf_cv
                        , blind_test_input_df = X_bts
                        , blind_test_target = y_bts
                        , add_cm = True
                        , add_yn = True)


def tidy_model_scores(all_scores_df: pd.DataFrame
                      , resampling: str
                      , training_size: int
                      , n_features: int
                      , score_order_map: dict) -> pd.DataFrame:
    """Reshape a raw score table into the ordered, annotated output format.

    Parameters
    ----------
    all_scores_df : raw scores with the score names as row labels
    resampling : label written to the 'resampling' column
    training_size : value written to the 'training_size' column
    n_features : value written to the 'n_features' column
    score_order_map : score name -> sort position; every key must be present
        among the score types found in all_scores_df

    Returns
    -------
    A new dataframe with the original row labels in 'original_names', the
    extra columns 'data_source' (BT/CV), 'score_type', 'score_order',
    'resampling', 'training_size' and 'n_features', sorted by data source
    and then by score order.

    Exits the program (non-zero status) if a score in score_order_map is
    missing from the table, since the rows could not be ordered reliably.
    """
    tidy_df = all_scores_df.filter(regex = score_rows_regex, axis = 0)
    tidy_df = tidy_df.reset_index()
    tidy_df.rename(columns = {'index': 'original_names'}, inplace = True)

    # Indicate whether BT or CV. Mapping over the single column that is
    # needed (rather than a row-wise DataFrame.apply) also gives a well
    # defined, empty result when no rows survive the filter.
    tidy_df['data_source'] = tidy_df['original_names'].map(
        lambda name: 'BT' if bt_pattern.search(name) else 'CV')

    tidy_df['score_type'] = tidy_df['original_names'].str.replace('bts_|test_', '', regex = True)

    if not set(score_order_map).issubset(set(tidy_df['score_type'])):
        sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')

    print('\nPASS: sorting df by score that is mapped onto the order I want')
    tidy_df['score_order'] = tidy_df['score_type'].map(score_order_map)
    tidy_df.sort_values(by = ['data_source', 'score_order']
                        , ascending = [True, True]
                        , inplace = True)

    # add cols: specific
    tidy_df['resampling'] = resampling
    tidy_df['training_size'] = training_size

    # add cols: common
    tidy_df['n_features'] = n_features

    return tidy_df

###############################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis: feature groups '
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
outFile = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'

###############################################################################
# Order in which the score rows appear within each data source in the output
score_type_ordermapD = { 'mcc'            : 1
                        , 'fscore'        : 2
                        , 'jcc'           : 3
                        , 'precision'     : 4
                        , 'recall'        : 5
                        , 'accuracy'      : 6
                        , 'roc_auc'       : 7
                        , 'TN'            : 8
                        , 'FP'            : 9
                        , 'FN'            : 10
                        , 'TP'            : 11
                        , 'trainingY_neg' : 12
                        , 'trainingY_pos' : 13
                        , 'blindY_neg'    : 14
                        , 'blindY_pos'    : 15
                        , 'fit_time'      : 16
                        , 'score_time'    : 17
                        }

# data dependent variable
bts_size = len(X_bts)

###############################################################################
#%% TTS: 7030 split
#================
# Baseline
# No resampling
#================
# other data dependent variables
training_size_ns = len(X)
n_features = len(X.columns)

scores_mmD = run_all_models(X, y)
baseline_all_scores = pd.DataFrame(scores_mmD)
baseline_all = tidy_model_scores(baseline_all_scores
                                 , resampling = 'none'
                                 , training_size = training_size_ns
                                 , n_features = n_features
                                 , score_order_map = score_type_ordermapD)

###############################################################################
#%% SMOTE NC: Oversampling [Numerical + categorical]
#================
# Baseline
# SMOTE NC
#================
# other data dependent variables
training_size_smnc = len(X_smnc)
n_features = len(X_smnc.columns)

smnc_scores_mmD = run_all_models(X_smnc, y_smnc)
smnc_all_scores = pd.DataFrame(smnc_scores_mmD)
smnc_all = tidy_model_scores(smnc_all_scores
                             , resampling = 'smnc'
                             , training_size = training_size_smnc
                             , n_features = n_features
                             , score_order_map = score_type_ordermapD)

###############################################################################
#%% ROS: Numerical + categorical
#================
# Baseline
# ROS
#================
# other data dependent variables
training_size_ros = len(X_ros)
n_features = len(X_ros.columns)

ros_scores_mmD = run_all_models(X_ros, y_ros)
ros_all_scores = pd.DataFrame(ros_scores_mmD)
ros_all = tidy_model_scores(ros_all_scores
                            , resampling = 'ros'
                            , training_size = training_size_ros
                            , n_features = n_features
                            , score_order_map = score_type_ordermapD)

###############################################################################
#%% RUS: Numerical + categorical
#================
# Baseline
# RUS
#================
# other data dependent variables
training_size_rus = len(X_rus)
n_features = len(X_rus.columns)

rus_scores_mmD = run_all_models(X_rus, y_rus)
rus_all_scores = pd.DataFrame(rus_scores_mmD)
rus_all = tidy_model_scores(rus_all_scores
                            , resampling = 'rus'
                            , training_size = training_size_rus
                            , n_features = n_features
                            , score_order_map = score_type_ordermapD)

###############################################################################
#%% ROS + RUS Combined: Numerical + categorical
#================
# Baseline
# ROUC
#================
# other data dependent variables
training_size_rouC = len(X_rouC)
n_features = len(X_rouC.columns)

rouC_scores_mmD = run_all_models(X_rouC, y_rouC)
rouC_all_scores = pd.DataFrame(rouC_scores_mmD)
rouC_all = tidy_model_scores(rouC_all_scores
                             , resampling = 'rouC'
                             , training_size = training_size_rouC
                             , n_features = n_features
                             , score_order_map = score_type_ordermapD)

###############################################################################
#%% COMBINING all FG dfs
#================
# Combine all
# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
#================
dfs_combine = [baseline_all, smnc_all, ros_all, rus_all, rouC_all]

dfs_nrows = max(len(df) for df in dfs_combine)
dfs_ncols = max(len(df.columns) for df in dfs_combine)

expected_nrows = len(dfs_combine) * dfs_nrows
expected_ncols = dfs_ncols

# Keep the shared columns in the order of the first df: a bare list(set(...))
# has no guaranteed order, which made the csv column order vary between runs.
common_col_set = set.intersection(*(set(df.columns) for df in dfs_combine))
common_cols = [col for col in dfs_combine[0].columns if col in common_col_set]

# Every df must have exactly the same columns, otherwise concatenating on the
# common ones would silently drop data.
if len(common_cols) != dfs_ncols:
    sys.exit('\nConcatenting dfs not possible,check numbers ')

combined_baseline = pd.concat([df[common_cols] for df in dfs_combine], ignore_index = True)

resampling_methods = combined_baseline[['resampling', 'training_size']]
resampling_methods = resampling_methods.drop_duplicates()
print('\nConcatenating dfs with different resampling methods:', tts_split
      , '\nNo. of dfs combining:', len(dfs_combine)
      , '\nThe sampling methods are:'
      , '\n', resampling_methods)

if len(combined_baseline) == expected_nrows and len(combined_baseline.columns) == expected_ncols:
    print('\nPASS:', len(dfs_combine), 'dfs successfully combined'
          , '\nnrows in combined_df:', len(combined_baseline)
          , '\nncols in combined_df:', len(combined_baseline.columns))
else:
    print('\nFAIL: concatenating failed'
          , '\nExpected nrows:', expected_nrows
          , '\nGot:', len(combined_baseline)
          , '\nExpected ncols:', expected_ncols
          , '\nGot:', len(combined_baseline.columns))
    # Non-zero status: a bare sys.exit() would report success to the caller.
    sys.exit(1)

# Add further column indications
combined_baseline['test_size'] = bts_size
combined_baseline['tts_split'] = tts_split

# TODO: add y target ratio for all

###############################################################################
#====================
# Write output file
#====================
combined_baseline.to_csv(outFile, index = False)
print('\nFile successfully written:', outFile)
###############################################################################