changed blind_test_input_df to blind_test_df in MultModelsCl
This commit is contained in:
parent
bc12dbd7c2
commit
0350784d52
114 changed files with 107251 additions and 863011 deletions
490
scripts/ml/run_7030.py
Normal file → Executable file
490
scripts/ml/run_7030.py
Normal file → Executable file
|
@ -55,7 +55,9 @@ print('\n#####################################################################\n
|
|||
#==================
|
||||
outdir_ml = outdir + 'ml/tts_7030/'
|
||||
print('\nOutput directory:', outdir_ml)
|
||||
outFile = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
|
||||
|
||||
outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
|
||||
outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
|
||||
|
||||
###############################################################################
|
||||
score_type_ordermapD = { 'mcc' : 1
|
||||
|
@ -77,38 +79,34 @@ score_type_ordermapD = { 'mcc' : 1
|
|||
, 'score_time' : 17
|
||||
}
|
||||
|
||||
# data dependent variable
|
||||
bts_size = len(X_bts)
|
||||
scoreCV_mapD = {'test_mcc' : 'MCC'
|
||||
, 'test_fscore' : 'F1'
|
||||
, 'test_precision' : 'Precision'
|
||||
, 'test_recall' : 'Recall'
|
||||
, 'test_accuracy' : 'Accuracy'
|
||||
, 'test_roc_auc' : 'ROC_AUC'
|
||||
, 'test_jcc' : 'JCC'
|
||||
}
|
||||
|
||||
scoreBT_mapD = {'bts_mcc' : 'MCC'
|
||||
, 'bts_fscore' : 'F1'
|
||||
, 'bts_precision' : 'Precision'
|
||||
, 'bts_recall' : 'Recall'
|
||||
, 'bts_accuracy' : 'Accuracy'
|
||||
, 'bts_roc_auc' : 'ROC_AUC'
|
||||
, 'bts_jcc' : 'JCC'
|
||||
}
|
||||
|
||||
# data dependent variables but NOT dependent on resampling
|
||||
bts_size = len(X_bts)
|
||||
yc2 = Counter(y_bts)
|
||||
yc2_ratio = yc2[0]/yc2[1]
|
||||
###############################################################################
|
||||
#%% TTS: 7030 split
|
||||
# mm_skf_scoresD = MultModelsCl(input_df = X
|
||||
# , target = y
|
||||
# , var_type = 'mixed'
|
||||
# , skf_cv = skf_cv
|
||||
# , blind_test_input_df = X_bts
|
||||
# , blind_test_target = y_bts)
|
||||
|
||||
# baseline_all = pd.DataFrame(mm_skf_scoresD)
|
||||
# baseline_all = baseline_all.T
|
||||
# #baseline_train = baseline_all.filter(like='train_', axis=1)
|
||||
# baseline_CT = baseline_all.filter(like='test_', axis=1)
|
||||
# baseline_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# baseline_BT = baseline_all.filter(like='bts_', axis=1)
|
||||
# baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# # Write csv
|
||||
# baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
|
||||
# baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
#%% Basic: No Oversampling
|
||||
#================
|
||||
# Baseline
|
||||
# No resampling
|
||||
#================
|
||||
# other data dependent variables
|
||||
training_size_ns = len(X)
|
||||
n_features = len(X.columns)
|
||||
|
||||
scores_mmD = MultModelsCl(input_df = X
|
||||
, target = y
|
||||
, var_type = 'mixed'
|
||||
|
@ -119,14 +117,47 @@ scores_mmD = MultModelsCl(input_df = X
|
|||
, add_yn = True)
|
||||
|
||||
baseline_all_scores = pd.DataFrame(scores_mmD)
|
||||
rs_none = 'none'
|
||||
#------------------------
|
||||
# WF: only CV and BTS
|
||||
#-----------------------
|
||||
baseline_allT = baseline_all_scores.T
|
||||
#baseline_train = baseline_all.filter(regex='train_', axis=1)
|
||||
|
||||
baseline_CV = baseline_allT.filter(regex='test_', axis = 1); baseline_CV.columns
|
||||
# map colnames for consistency to allow concatenating
|
||||
baseline_CV.columns = baseline_CV.columns.map(scoreCV_mapD); baseline_CV.columns
|
||||
baseline_CV['Data_source'] = 'CV'
|
||||
baseline_CV['Resampling'] = rs_none
|
||||
|
||||
baseline_BT = baseline_allT.filter(regex='bts_', axis = 1); baseline_BT.columns
|
||||
# map colnames for consistency to allow concatenating
|
||||
baseline_BT.columns = baseline_BT.columns.map(scoreBT_mapD); baseline_BT.columns
|
||||
baseline_BT['Data_source'] = 'BT'
|
||||
baseline_BT['Resampling'] = rs_none
|
||||
|
||||
# # Write csv
|
||||
#baseline_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
#baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
# baseline_CV.to_csv(outdir_ml + gene.lower() + '_baseline_CV_allF.csv')
|
||||
# baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
#----------------------------------
|
||||
# LF*: CV + BTS + Other info
|
||||
#-----------------------------------
|
||||
# other data dependent variables
|
||||
training_size_ns = len(X)
|
||||
n_features = len(X.columns)
|
||||
yc1 = Counter(y)
|
||||
yc1_ratio = yc1[0]/yc1[1]
|
||||
|
||||
baseline_all = baseline_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
|
||||
baseline_all = baseline_all.reset_index()
|
||||
baseline_all.rename(columns = {'index': 'original_names'}, inplace = True)
|
||||
|
||||
# Indicate whether BT or CT
|
||||
# Indicate whether BT or CV
|
||||
bt_pattern = re.compile(r'bts_.*')
|
||||
baseline_all['data_source'] = baseline_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
|
||||
baseline_all['Data_source'] = baseline_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
|
||||
|
||||
baseline_all['score_type'] = baseline_all['original_names'].str.replace('bts_|test_', '', regex = True)
|
||||
|
||||
|
@ -137,48 +168,22 @@ cL2 = list(score_type_uniqueN)
|
|||
if set(cL1).issubset(cL2):
|
||||
print('\nPASS: sorting df by score that is mapped onto the order I want')
|
||||
baseline_all['score_order'] = baseline_all['score_type'].map(score_type_ordermapD)
|
||||
baseline_all.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
|
||||
baseline_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
|
||||
else:
|
||||
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
|
||||
|
||||
# add cols: specific
|
||||
baseline_all['resampling'] = 'none'
|
||||
baseline_all['Resampling'] = rs_none
|
||||
baseline_all['training_size'] = training_size_ns
|
||||
|
||||
# add cols: common
|
||||
baseline_all['n_features'] = n_features
|
||||
#baseline_all['test_size'] = bts_size
|
||||
#baseline_all['tts_split'] = tts_split
|
||||
baseline_all['trainingY_ratio']= round(yc1_ratio,2)
|
||||
baseline_all['n_features'] = n_features
|
||||
|
||||
###############################################################################
|
||||
#%% SMOTE NC: Oversampling [Numerical + categorical]
|
||||
# mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
|
||||
# , target = y_smnc
|
||||
# , var_type = 'mixed'
|
||||
# , skf_cv = skf_cv
|
||||
# , blind_test_input_df = X_bts
|
||||
# , blind_test_target = y_bts)
|
||||
# smnc_all = pd.DataFrame(mm_skf_scoresD7)
|
||||
# smnc_all = smnc_all.T
|
||||
|
||||
# smnc_CT = smnc_all.filter(like='test_', axis=1)
|
||||
# smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# smnc_BT = smnc_all.filter(like='bts_', axis=1)
|
||||
# smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# # Write csv
|
||||
# smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
|
||||
# smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#%% SMOTE NC: Smote Oversampling [Numerical + categorical]
|
||||
#================
|
||||
# Baselone
|
||||
# SMOTE NC
|
||||
# Baseline
|
||||
# SMOTE NC: SMNC
|
||||
#================
|
||||
# other data dependent variables
|
||||
training_size_smnc = len(X_smnc)
|
||||
n_features = len(X_smnc.columns)
|
||||
|
||||
smnc_scores_mmD = MultModelsCl(input_df = X_smnc
|
||||
, target = y_smnc
|
||||
, var_type = 'mixed'
|
||||
|
@ -189,14 +194,46 @@ smnc_scores_mmD = MultModelsCl(input_df = X_smnc
|
|||
, add_yn = True)
|
||||
|
||||
smnc_all_scores = pd.DataFrame(smnc_scores_mmD)
|
||||
rs_smnc = 'smnc'
|
||||
#------------------------
|
||||
# WF: only CV and BTS
|
||||
#-----------------------
|
||||
smnc_allT = smnc_all_scores.T
|
||||
|
||||
smnc_CV = smnc_allT.filter(regex='test_', axis = 1); smnc_CV.columns
|
||||
# map colnames for consistency to allow concatenating
|
||||
smnc_CV.columns = smnc_CV.columns.map(scoreCV_mapD); smnc_CV.columns
|
||||
smnc_CV['Data_source'] = 'CV'
|
||||
smnc_CV['Resampling'] = rs_smnc
|
||||
|
||||
smnc_BT = smnc_allT.filter(regex='bts_', axis = 1); smnc_BT.columns
|
||||
# map colnames for consistency to allow concatenating
|
||||
smnc_BT.columns = smnc_BT.columns.map(scoreBT_mapD); smnc_BT.columns
|
||||
smnc_BT['Data_source'] = 'BT'
|
||||
smnc_BT['Resampling'] = rs_smnc
|
||||
|
||||
# Write csv
|
||||
# smnc_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
# smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
# smnc_CV.to_csv(outdir_ml + gene.lower() + '_smnc_CV_allF.csv')
|
||||
# smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#----------------------------------
|
||||
# LF*: CV + BTS + Other info
|
||||
#-----------------------------------
|
||||
# other data dependent variables
|
||||
training_size_smnc = len(X_smnc)
|
||||
n_features = len(X_smnc.columns)
|
||||
yc1_smnc = Counter(y_smnc)
|
||||
yc1_ratio_smnc = yc1_smnc[0]/yc1_smnc[1]
|
||||
|
||||
smnc_all = smnc_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
|
||||
smnc_all = smnc_all.reset_index()
|
||||
smnc_all.rename(columns = {'index': 'original_names'}, inplace = True)
|
||||
|
||||
# Indicate whether BT or CT
|
||||
# Indicate whether BT or CV
|
||||
bt_pattern = re.compile(r'bts_.*')
|
||||
smnc_all['data_source'] = smnc_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
|
||||
smnc_all['Data_source'] = smnc_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
|
||||
|
||||
smnc_all['score_type'] = smnc_all['original_names'].str.replace('bts_|test_', '', regex = True)
|
||||
|
||||
|
@ -207,46 +244,22 @@ cL2 = list(score_type_uniqueN)
|
|||
if set(cL1).issubset(cL2):
|
||||
print('\nPASS: sorting df by score that is mapped onto the order I want')
|
||||
smnc_all['score_order'] = smnc_all['score_type'].map(score_type_ordermapD)
|
||||
smnc_all.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
|
||||
smnc_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
|
||||
else:
|
||||
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
|
||||
|
||||
# add cols: specific
|
||||
smnc_all['resampling'] = 'smnc'
|
||||
smnc_all['training_size'] = training_size_smnc
|
||||
smnc_all['Resampling'] = rs_smnc
|
||||
smnc_all['training_size'] = training_size_smnc
|
||||
smnc_all['trainingY_ratio'] = round(yc1_ratio_smnc,2)
|
||||
smnc_all['n_features'] = n_features
|
||||
|
||||
# add cols: common
|
||||
smnc_all['n_features'] = n_features
|
||||
#smnc_all['test_size'] = bts_size
|
||||
#smnc_all['tts_split'] = tts_split
|
||||
###############################################################################
|
||||
#%% ROS: Numerical + categorical
|
||||
# mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
|
||||
# , target = y_ros
|
||||
# , var_type = 'mixed'
|
||||
# , skf_cv = skf_cv
|
||||
# , blind_test_input_df = X_bts
|
||||
# , blind_test_target = y_bts)
|
||||
# ros_all = pd.DataFrame(mm_skf_scoresD3)
|
||||
# ros_all = ros_all.T
|
||||
|
||||
# ros_CT = ros_all.filter(like='test_', axis=1)
|
||||
# ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# ros_BT = ros_all.filter(like='bts_', axis=1)
|
||||
# ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# # Write csv
|
||||
# ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
|
||||
# ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
#%% ROS: Random Over Sampling [Numerical + categorical]
|
||||
#================
|
||||
# Baseline
|
||||
# ROS
|
||||
#================
|
||||
# other data dependent variables
|
||||
training_size_ros = len(X_ros)
|
||||
n_features = len(X_ros.columns)
|
||||
|
||||
ros_scores_mmD = MultModelsCl(input_df = X_ros
|
||||
, target = y_ros
|
||||
, var_type = 'mixed'
|
||||
|
@ -257,14 +270,45 @@ ros_scores_mmD = MultModelsCl(input_df = X_ros
|
|||
, add_yn = True)
|
||||
|
||||
ros_all_scores = pd.DataFrame(ros_scores_mmD)
|
||||
rs_ros = 'ros'
|
||||
#------------------------
|
||||
# WF: only CV and BTS
|
||||
#-----------------------
|
||||
ros_allT = ros_all_scores.T
|
||||
|
||||
ros_CV = ros_allT.filter(regex='test_', axis = 1); ros_CV.columns
|
||||
# map colnames for consistency to allow concatenating
|
||||
ros_CV.columns = ros_CV.columns.map(scoreCV_mapD); ros_CV.columns
|
||||
ros_CV['Data_source'] = 'CV'
|
||||
ros_CV['Resampling'] = rs_ros
|
||||
|
||||
ros_BT = ros_allT.filter(regex='bts_', axis = 1); ros_BT.columns
|
||||
# map colnames for consistency to allow concatenating
|
||||
ros_BT.columns = ros_BT.columns.map(scoreBT_mapD); ros_BT.columns
|
||||
ros_BT['Data_source'] = 'BT'
|
||||
ros_BT['Resampling'] = rs_ros
|
||||
|
||||
# Write csv
|
||||
# ros_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
# ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
# ros_CV.to_csv(outdir_ml + gene.lower() + '_ros_CV_allF.csv')
|
||||
# ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
#----------------------------------
|
||||
# LF*: CV + BTS + Other info
|
||||
#----------------------------------
|
||||
# other data dependent variables
|
||||
training_size_ros = len(X_ros)
|
||||
n_features = len(X_ros.columns)
|
||||
yc1_ros = Counter(y_ros)
|
||||
yc1_ratio_ros = yc1_ros[0]/yc1_ros[1]
|
||||
|
||||
ros_all = ros_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
|
||||
ros_all = ros_all.reset_index()
|
||||
ros_all.rename(columns = {'index': 'original_names'}, inplace = True)
|
||||
|
||||
# Indicate whether BT or CT
|
||||
# Indicate whether BT or CV
|
||||
bt_pattern = re.compile(r'bts_.*')
|
||||
ros_all['data_source'] = ros_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
|
||||
ros_all['Data_source'] = ros_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
|
||||
|
||||
ros_all['score_type'] = ros_all['original_names'].str.replace('bts_|test_', '', regex = True)
|
||||
|
||||
|
@ -275,47 +319,21 @@ cL2 = list(score_type_uniqueN)
|
|||
if set(cL1).issubset(cL2):
|
||||
print('\nPASS: sorting df by score that is mapped onto the order I want')
|
||||
ros_all['score_order'] = ros_all['score_type'].map(score_type_ordermapD)
|
||||
ros_all.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
|
||||
ros_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
|
||||
else:
|
||||
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
|
||||
|
||||
# add cols: specific
|
||||
ros_all['resampling'] = 'ros'
|
||||
ros_all['training_size'] = training_size_ros
|
||||
|
||||
# add cols: common
|
||||
ros_all['n_features'] = n_features
|
||||
#ros_all['test_size'] = bts_size
|
||||
#ros_all['tts_split'] = tts_split
|
||||
ros_all['Resampling'] = rs_ros
|
||||
ros_all['training_size'] = training_size_ros
|
||||
ros_all['trainingY_ratio'] = round(yc1_ratio_ros,2)
|
||||
ros_all['n_features'] = n_features
|
||||
###############################################################################
|
||||
#%% RUS: Numerical + categorical
|
||||
# mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
|
||||
# , target = y_rus
|
||||
# , var_type = 'mixed'
|
||||
# , skf_cv = skf_cv
|
||||
# , blind_test_input_df = X_bts
|
||||
# , blind_test_target = y_bts)
|
||||
# rus_all = pd.DataFrame(mm_skf_scoresD4)
|
||||
# rus_all = rus_all.T
|
||||
|
||||
# rus_CT = rus_all.filter(like='test_', axis=1)
|
||||
# rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# rus_BT = rus_all.filter(like='bts_' , axis=1)
|
||||
# rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# # Write csv
|
||||
# rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
|
||||
# rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#%% RUS: Random Under Sampling [Numerical + categorical]
|
||||
#================
|
||||
# Baseline
|
||||
# RUS
|
||||
#================
|
||||
# other data dependent variables
|
||||
training_size_rus = len(X_rus)
|
||||
n_features = len(X_rus.columns)
|
||||
|
||||
rus_scores_mmD = MultModelsCl(input_df = X_rus
|
||||
, target = y_rus
|
||||
, var_type = 'mixed'
|
||||
|
@ -326,14 +344,46 @@ rus_scores_mmD = MultModelsCl(input_df = X_rus
|
|||
, add_yn = True)
|
||||
|
||||
rus_all_scores = pd.DataFrame(rus_scores_mmD)
|
||||
rs_rus = 'rus'
|
||||
#-----------------------
|
||||
# WF: only CV and BTS
|
||||
#-----------------------
|
||||
rus_allT = rus_all_scores.T
|
||||
|
||||
rus_CV = rus_allT.filter(regex='test_', axis=1); rus_CV.columns
|
||||
# map colnames for consistency to allow concatenating
|
||||
rus_CV.columns = rus_CV.columns.map(scoreCV_mapD); rus_CV.columns
|
||||
rus_CV['Data_source'] = 'CV'
|
||||
rus_CV['Resampling'] = rs_rus
|
||||
|
||||
rus_BT = rus_allT.filter(regex='bts_' , axis=1); rus_BT.columns
|
||||
# map colnames for consistency to allow concatenating
|
||||
rus_BT.columns = rus_BT.columns.map(scoreBT_mapD); rus_BT.columns
|
||||
rus_BT['Data_source'] = 'BT'
|
||||
rus_BT['Resampling'] = rs_rus
|
||||
|
||||
# # Write csv
|
||||
# rus_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
# rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
# rus_CV.to_csv(outdir_ml + gene.lower() + '_rus_CV_allF.csv')
|
||||
# rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#----------------------------------
|
||||
# LF*: CV + BTS + Other info
|
||||
#----------------------------------
|
||||
# other data dependent variables
|
||||
training_size_rus = len(X_rus)
|
||||
n_features = len(X_rus.columns)
|
||||
yc1_rus = Counter(y_rus)
|
||||
yc1_ratio_rus = yc1_rus[0]/yc1_rus[1]
|
||||
|
||||
rus_all = rus_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
|
||||
rus_all = rus_all.reset_index()
|
||||
rus_all.rename(columns = {'index': 'original_names'}, inplace = True)
|
||||
|
||||
# Indicate whether BT or CT
|
||||
# Indicate whether BT or CV
|
||||
bt_pattern = re.compile(r'bts_.*')
|
||||
rus_all['data_source'] = rus_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
|
||||
rus_all['Data_source'] = rus_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
|
||||
|
||||
rus_all['score_type'] = rus_all['original_names'].str.replace('bts_|test_', '', regex = True)
|
||||
|
||||
|
@ -344,47 +394,22 @@ cL2 = list(score_type_uniqueN)
|
|||
if set(cL1).issubset(cL2):
|
||||
print('\nPASS: sorting df by score that is mapped onto the order I want')
|
||||
rus_all['score_order'] = rus_all['score_type'].map(score_type_ordermapD)
|
||||
rus_all.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
|
||||
rus_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
|
||||
else:
|
||||
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
|
||||
|
||||
# add cols: specific
|
||||
rus_all['resampling'] = 'rus'
|
||||
rus_all['training_size'] = training_size_rus
|
||||
rus_all['Resampling'] = rs_rus
|
||||
rus_all['training_size'] = training_size_rus
|
||||
rus_all['trainingY_ratio'] = round(yc1_ratio_rus,2)
|
||||
rus_all['n_features'] = n_features
|
||||
|
||||
# add cols: common
|
||||
rus_all['n_features'] = n_features
|
||||
#rus_all['test_size'] = bts_size
|
||||
#rus_all['tts_split'] = tts_split
|
||||
###############################################################################
|
||||
#%% ROS + RUS Combined: Numerical + categorical
|
||||
# mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
|
||||
# , target = y_rouC
|
||||
# , var_type = 'mixed'
|
||||
# , skf_cv = skf_cv
|
||||
# , blind_test_input_df = X_bts
|
||||
# , blind_test_target = y_bts)
|
||||
# rouC_all = pd.DataFrame(mm_skf_scoresD8)
|
||||
# rouC_all = rouC_all.T
|
||||
|
||||
# rouC_CT = rouC_all.filter(like='test_', axis=1)
|
||||
# rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# rouC_BT = rouC_all.filter(like='bts_', axis=1)
|
||||
# rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# # Write csv
|
||||
# rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
|
||||
# rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
||||
|
||||
#%% ROS+RUS Combined: [Numerical + categorical]
|
||||
#================
|
||||
# Baseline
|
||||
# ROUC
|
||||
#================
|
||||
# other data dependent variables
|
||||
training_size_rouC = len(X_rouC)
|
||||
n_features = len(X_rouC.columns)
|
||||
|
||||
rouC_scores_mmD = MultModelsCl(input_df = X_rouC
|
||||
, target = y_rouC
|
||||
, var_type = 'mixed'
|
||||
|
@ -395,14 +420,46 @@ rouC_scores_mmD = MultModelsCl(input_df = X_rouC
|
|||
, add_yn = True)
|
||||
|
||||
rouC_all_scores = pd.DataFrame(rouC_scores_mmD)
|
||||
rs_rouC = 'rouC'
|
||||
#-----------------------
|
||||
# WF: only CV and BTS
|
||||
#-----------------------
|
||||
rouC_allT = rouC_all_scores.T
|
||||
|
||||
rouC_CV = rouC_allT.filter(regex='test_', axis=1); rouC_CV.columns
|
||||
# map colnames for consistency to allow concatenating
|
||||
rouC_CV.columns = rouC_CV.columns.map(scoreCV_mapD); rouC_CV.columns
|
||||
rouC_CV['Data_source'] = 'CV'
|
||||
rouC_CV['Resampling'] = rs_rouC
|
||||
|
||||
rouC_BT = rouC_allT.filter(regex='bts_', axis=1); rouC_BT.columns
|
||||
# map colnames for consistency to allow concatenating
|
||||
rouC_BT.columns = rouC_BT.columns.map(scoreBT_mapD); rouC_BT.columns
|
||||
rouC_BT['Data_source'] = 'BT'
|
||||
rouC_BT['Resampling'] = rs_rouC
|
||||
|
||||
# Write csv
|
||||
# rouC_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
# rouC_CV.to_csv(outdir_ml + gene.lower() + '_rouC_CV_allF.csv')
|
||||
# rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
# rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
||||
|
||||
#----------------------------------
|
||||
# LF*: CV + BTS + Other info
|
||||
#----------------------------------
|
||||
# other data dependent variables
|
||||
training_size_rouC = len(X_rouC)
|
||||
n_features = len(X_rouC.columns)
|
||||
yc1_rouC = Counter(y_rouC)
|
||||
yc1_ratio_rouC = yc1_rouC[0]/yc1_rouC[1]
|
||||
|
||||
rouC_all = rouC_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
|
||||
rouC_all = rouC_all.reset_index()
|
||||
rouC_all.rename(columns = {'index': 'original_names'}, inplace = True)
|
||||
|
||||
# Indicate whether BT or CT
|
||||
# Indicate whether BT or CV
|
||||
bt_pattern = re.compile(r'bts_.*')
|
||||
rouC_all['data_source'] = rouC_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
|
||||
rouC_all['Data_source'] = rouC_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
|
||||
|
||||
rouC_all['score_type'] = rouC_all['original_names'].str.replace('bts_|test_', '', regex = True)
|
||||
|
||||
|
@ -413,29 +470,76 @@ cL2 = list(score_type_uniqueN)
|
|||
if set(cL1).issubset(cL2):
|
||||
print('\nPASS: sorting df by score that is mapped onto the order I want')
|
||||
rouC_all['score_order'] = rouC_all['score_type'].map(score_type_ordermapD)
|
||||
rouC_all.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
|
||||
rouC_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
|
||||
else:
|
||||
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
|
||||
|
||||
# add cols: specific
|
||||
rouC_all['resampling'] = 'rouC'
|
||||
rouC_all['training_size'] = training_size_rouC
|
||||
|
||||
# add cols: common
|
||||
rouC_all['n_features'] = n_features
|
||||
#rouC_all['test_size'] = bts_size
|
||||
#rouC_all['tts_split'] = tts_split
|
||||
|
||||
|
||||
|
||||
|
||||
rouC_all['Resampling'] = rs_rouC
|
||||
rouC_all['training_size'] = training_size_rouC
|
||||
rouC_all['trainingY_ratio'] = round(yc1_ratio_rouC,2)
|
||||
rouC_all['n_features'] = n_features
|
||||
|
||||
###############################################################################
|
||||
#%% COMBINING all FG dfs
|
||||
#================
|
||||
# Combine all
|
||||
#%% COMBINING all dfs: WF and LF
|
||||
# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
|
||||
#================
|
||||
#%% Combine WF
|
||||
#-----------------
|
||||
# Combine WF
|
||||
#-----------------
|
||||
dfs_combine_wf = [baseline_BT, smnc_BT, ros_BT, rus_BT, rouC_BT,
|
||||
baseline_CV, smnc_CV, ros_CV, rus_CV, rouC_CV]
|
||||
|
||||
dfs_nrows_wf = []
|
||||
for df in dfs_combine_wf:
|
||||
dfs_nrows_wf = dfs_nrows_wf + [len(df)]
|
||||
dfs_nrows_wf = max(dfs_nrows_wf)
|
||||
|
||||
dfs_ncols_wf = []
|
||||
for df in dfs_combine_wf:
|
||||
dfs_ncols_wf = dfs_ncols_wf + [len(df.columns)]
|
||||
dfs_ncols_wf = max(dfs_ncols_wf)
|
||||
|
||||
expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
|
||||
expected_ncols_wf = dfs_ncols_wf
|
||||
|
||||
common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))
|
||||
|
||||
if len(common_cols_wf) == dfs_ncols_wf :
|
||||
combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index=False)
|
||||
resampling_methods_wf = combined_baseline_wf[['Resampling']]
|
||||
resampling_methods_wf = resampling_methods_wf.drop_duplicates()
|
||||
print('\nConcatenating dfs with different resampling methods [WF]:', tts_split
|
||||
, '\nNo. of dfs combining:', len(dfs_combine_wf)
|
||||
, '\nThe sampling methods are:'
|
||||
, '\n', resampling_methods_wf)
|
||||
if len(combined_baseline_wf) == expected_nrows_wf and len(combined_baseline_wf.columns) == expected_ncols_wf:
|
||||
print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
|
||||
, '\nnrows in combined_df_wf:', len(combined_baseline_wf)
|
||||
, '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
|
||||
else:
|
||||
print('\nFAIL: concatenating failed'
|
||||
, '\nExpected nrows:', expected_nrows_wf
|
||||
, '\nGot:', len(combined_baseline_wf)
|
||||
, '\nExpected ncols:', expected_ncols_wf
|
||||
, '\nGot:', len(combined_baseline_wf.columns))
|
||||
sys.exit()
|
||||
else:
|
||||
sys.exit('\nConcatenting dfs not possible [WF],check numbers ')
|
||||
|
||||
# Add index as a column
|
||||
combined_baseline_wf.columns
|
||||
combined_baseline_wf = combined_baseline_wf.reset_index()
|
||||
combined_baseline_wf.rename(columns = {'index': 'Model_name'}, inplace = True)
|
||||
combined_baseline_wf.head()
|
||||
|
||||
# sort df: Resampling, Data_source, and MCC
|
||||
combined_baseline_wf.sort_values(by = ['Resampling', 'Data_source', 'MCC'], ascending = [True, True, False], inplace = True)
|
||||
##############################################################################
|
||||
#%% Combine LF
|
||||
#-----------------
|
||||
# Combine LF*
|
||||
#-----------------
|
||||
dfs_combine = [baseline_all, smnc_all, ros_all, rus_all, rouC_all ]
|
||||
|
||||
dfs_nrows = []
|
||||
|
@ -459,7 +563,7 @@ common_cols = list(set.intersection(*(set(df.columns) for df in dfs_combine)))
|
|||
|
||||
if len(common_cols) == dfs_ncols :
|
||||
combined_baseline = pd.concat([df[common_cols] for df in dfs_combine], ignore_index=True)
|
||||
resampling_methods = combined_baseline[['resampling', 'training_size']]
|
||||
resampling_methods = combined_baseline[['Resampling', 'training_size']]
|
||||
resampling_methods = resampling_methods.drop_duplicates()
|
||||
print('\nConcatenating dfs with different resampling methods:', tts_split
|
||||
, '\nNo. of dfs combining:', len(dfs_combine)
|
||||
|
@ -480,20 +584,30 @@ else:
|
|||
sys.exit('\nConcatenting dfs not possible,check numbers ')
|
||||
|
||||
# Add further column indications
|
||||
combined_baseline['test_size'] = bts_size
|
||||
combined_baseline['tts_split'] = tts_split
|
||||
combined_baseline['test_size'] = bts_size
|
||||
combined_baseline['tts_split'] = tts_split
|
||||
combined_baseline['testY_ratio'] = round(yc2_ratio,2)
|
||||
#combined_baseline.columns
|
||||
|
||||
# TODO:
|
||||
# ADD y target ratio for all
|
||||
# change column names to lower case for consistency
|
||||
combined_baseline.rename(columns = {'Resampling' : 'resampling'
|
||||
, 'Data_source': 'data_source'}, inplace = True)
|
||||
combined_baseline.columns
|
||||
|
||||
# sort df: resampling, data_source, score_order
|
||||
combined_baseline.sort_values(by = ['resampling', 'data_source', 'score_order'], ascending = [True, True, True], inplace = True)
|
||||
|
||||
# # row bind
|
||||
# if all(ll((baseline_all.columns == baseline_GN.columns == baseline_STR.columns)):
|
||||
# if all(XXX):
|
||||
# print('\nPASS:colnames match, proceeding to rowbind')
|
||||
# comb_df = pd.concat()], axis = 0, ignore_index = True ) combined_baseline
|
||||
###############################################################################
|
||||
#====================
|
||||
# Write output file
|
||||
#====================
|
||||
combined_baseline.to_csv(outFile, index = False)
|
||||
print('\nFile successfully written:', outFile)
|
||||
combined_baseline_wf.to_csv(outFile_wf, index = False)
|
||||
print('\nFile successfully written:', outFile_wf)
|
||||
|
||||
combined_baseline.to_csv(outFile_lf, index = False)
|
||||
print('\nFile successfully written:', outFile_lf)
|
||||
###############################################################################
|
Loading…
Add table
Add a link
Reference in a new issue