#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 20 13:05:23 2022

@author: tanu
"""
import argparse
import re
import sys

import pandas as pd
###############################################################################
# Example values for interactive runs:
# gene = 'pncA'
# drug = 'pyrazinamide'
# total_mtblineage_uc = 8
#%% command line args: case sensitive
# Both flags are optional and fall back to the empty string; downstream code
# (setvars / output paths) is driven entirely by these two values.
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
args = arg_parser.parse_args()

drug = args.drug
gene = args.gene
###############################################################################
#==================
# other vars
#==================
# Label for the train/test split and the suffix stamped onto output file names.
tts_split = '70/30'
OutFile_suffix = '7030'
###############################################################################
#==================
# Import data
#==================
# setvars() configures the ml_data_7030 module for this gene/drug pair; the
# module is star-imported a second time afterwards, presumably to rebind the
# freshly-set module-level names (X, y, X_bts, y_bts, skf_cv, outdir and the
# resampled X_*/y_* variants) into this namespace — confirm in ml_data_7030.
from ml_data_7030 import *
setvars(gene, drug)
from ml_data_7030 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

#====================
# Import ML function
#====================
# TT run all ML clfs: baseline model
from MultModelsCl import MultModelsCl
############################################################################
# Progress banner so log output identifies which gene/drug pair is running.
print('\n#####################################################################\n'
      , '\nRunning ML analysis: feature groups '
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================
# outdir comes from the ml_data_7030 star-import; results land under ml/tts_7030/.
outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
outFile = f"{outdir_ml}{gene.lower()}_baselineC_{OutFile_suffix}.csv"
###############################################################################
# Fixed display order for score rows: performance metrics first, then
# confusion-matrix counts, class counts, and timings.
score_type_ordermapD = {
    'mcc': 1,
    'fscore': 2,
    'jcc': 3,
    'precision': 4,
    'recall': 5,
    'accuracy': 6,
    'roc_auc': 7,
    'TN': 8,
    'FP': 9,
    'FN': 10,
    'TP': 11,
    'trainingY_neg': 12,
    'trainingY_pos': 13,
    'blindY_neg': 14,
    'blindY_pos': 15,
    'fit_time': 16,
    'score_time': 17,
}
# data dependent variable: blind test set size (used when labelling output)
bts_size = len(X_bts)
###############################################################################
#%% TTS: 7030 split
# The five resampling variants below (none / SMOTE-NC / ROS / RUS / rouC) all
# ran the identical ~40-line score-then-tidy pipeline copy-pasted five times.
# That pipeline is factored into the two helpers below; each section is now a
# single call. Behaviour (prints, sys.exit on failure, resulting df columns)
# is unchanged.

def _tidy_scores(scores_mmD, resampling, training_size, n_features):
    """Reshape a MultModelsCl score dict into a long-format dataframe.

    Keeps only the CV/blind-test score rows, labels each row's data source
    ('BT' for blind-test 'bts_*' rows, 'CV' otherwise), maps each score name
    onto the fixed display order in score_type_ordermapD, sorts by source and
    score order, and stamps the resampling / training_size / n_features
    metadata columns.

    Exits the process if any expected score name is missing, matching the
    original per-section behaviour.
    """
    df = pd.DataFrame(scores_mmD)
    df = df.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
    df = df.reset_index()
    df.rename(columns = {'index': 'original_names'}, inplace = True)

    # Indicate whether BT (blind test) or CV (cross-validation)
    bt_pattern = re.compile(r'bts_.*')
    df['data_source'] = df.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)

    # strip the source prefix so both BT and CV rows share one score name
    df['score_type'] = df['original_names'].str.replace('bts_|test_', '', regex = True)

    if set(score_type_ordermapD.keys()).issubset(set(df['score_type'])):
        print('\nPASS: sorting df by score that is mapped onto the order I want')
        df['score_order'] = df['score_type'].map(score_type_ordermapD)
        df.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
    else:
        sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')

    # add cols: specific to this resampling run
    df['resampling'] = resampling
    df['training_size'] = training_size
    # add cols: common
    df['n_features'] = n_features
    return df


def _run_baseline(input_df, target, resampling):
    """Run all baseline classifiers on one (X, y) variant and tidy the scores."""
    scores_mmD = MultModelsCl(input_df = input_df
                              , target = target
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_input_df = X_bts
                              , blind_test_target = y_bts
                              , add_cm = True
                              , add_yn = True)
    return _tidy_scores(scores_mmD, resampling, len(input_df), len(input_df.columns))


#================
# Baseline: no resampling
#================
training_size_ns = len(X)
baseline_all = _run_baseline(X, y, 'none')

#%% SMOTE NC: Oversampling [Numerical + categorical]
#================
# Baseline: SMOTE NC
#================
training_size_smnc = len(X_smnc)
smnc_all = _run_baseline(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
#================
# Baseline: ROS
#================
training_size_ros = len(X_ros)
ros_all = _run_baseline(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
#================
# Baseline: RUS
#================
training_size_rus = len(X_rus)
rus_all = _run_baseline(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
#================
# Baseline: ROUC
#================
training_size_rouC = len(X_rouC)
rouC_all = _run_baseline(X_rouC, y_rouC, 'rouC')

# kept for compatibility with the original script's module-level names
n_features = len(X_rouC.columns)
###############################################################################
#%% COMBINING all FG dfs
#================
# Combine all resampling results, keeping only the columns common to every df
# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
#================
dfs_combine = [baseline_all, smnc_all, ros_all, rus_all, rouC_all]

# Max rows/cols across the dfs (idiomatic max-over-generator replaces the
# original append-then-max loops); used to sanity-check the concatenation.
dfs_nrows = max(len(df) for df in dfs_combine)
dfs_ncols = max(len(df.columns) for df in dfs_combine)

expected_nrows = len(dfs_combine) * dfs_nrows
expected_ncols = dfs_ncols

# Columns shared by ALL dfs; concat is only performed on these.
common_cols = list(set.intersection(*(set(df.columns) for df in dfs_combine)))

# If the shared-column count equals the widest df, every df has identical
# columns and row-binding them is safe.
if len(common_cols) == dfs_ncols:
    combined_baseline = pd.concat([df[common_cols] for df in dfs_combine], ignore_index = True)
    resampling_methods = combined_baseline[['resampling', 'training_size']]
    resampling_methods = resampling_methods.drop_duplicates()
    print('\nConcatenating dfs with different resampling methods:', tts_split
          , '\nNo. of dfs combining:', len(dfs_combine)
          , '\nThe sampling methods are:'
          , '\n', resampling_methods)
    if len(combined_baseline) == expected_nrows and len(combined_baseline.columns) == expected_ncols:
        print('\nPASS:', len(dfs_combine), 'dfs successfully combined'
              , '\nnrows in combined_df:', len(combined_baseline)
              , '\nncols in combined_df:', len(combined_baseline.columns))
    else:
        print('\nFAIL: concatenating failed'
              , '\nExpected nrows:', expected_nrows
              , '\nGot:', len(combined_baseline)
              , '\nExpected ncols:', expected_ncols
              , '\nGot:', len(combined_baseline.columns))
        sys.exit()
else:
    sys.exit('\nConcatenting dfs not possible,check numbers ')

# Add further column indications
combined_baseline['test_size'] = bts_size
combined_baseline['tts_split'] = tts_split

# TODO: add the y target ratio for all resampling variants
###############################################################################
#====================
# Write output file
#====================
# Persist the combined baseline scores; outFile was built from gene and the
# OutFile_suffix earlier in the script.
combined_baseline.to_csv(outFile, index = False)
print('\nFile successfully written:', outFile)
###############################################################################