Optimised run_7030.py to generate output from a dict, now that the process function and parameter dicts have been added

This commit is contained in:
Tanushree Tunstall 2022-06-24 15:40:18 +01:00
parent 7dc7e25016
commit b37a950fec
12 changed files with 180 additions and 128408 deletions

635
scripts/ml/run_7030.py Executable file → Normal file
View file

@ -5,28 +5,28 @@ Created on Mon Jun 20 13:05:23 2022
@author: tanu
"""
#%%Imports ####################################################################
import re
import argparse
###############################################################################
import os, sys
# gene = 'pncA'
# drug = 'pyrazinamide'
#total_mtblineage_uc = 8
###############################################################################
#%% command line args: case sensitive
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
args = arg_parser.parse_args()
# arg_parser = argparse.ArgumentParser()
# arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
# arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
# args = arg_parser.parse_args()
drug = args.drug
gene = args.gene
# drug = args.drug
# gene = args.gene
###############################################################################
#==================
# other vars
#==================
tts_split = '70/30'
OutFile_suffix = '7030'
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
###############################################################################
#==================
# Import data
@ -39,10 +39,15 @@ from ml_data_7030 import *
#from UQ_yc_RunAllClfs import run_all_ML
#====================
# Import ML function
# Import ML functions
#====================
# TT run all ML clfs: baseline model
from MultModelsCl import MultModelsCl
from MultClfs import *
#==================
# other vars
#==================
tts_split_7030 = '70_30'
OutFile_suffix = '7030'
#==================
# Specify outdir
@ -51,563 +56,71 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
#outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
###############################################################################
# Rank of each score type (1 = first); the long-format results are sorted on
# the 'score_order' column obtained by mapping 'score_type' through this dict.
score_type_ordermapD = dict(zip(
    ( 'mcc', 'fscore', 'jcc', 'precision', 'recall', 'accuracy', 'roc_auc'
    , 'TN', 'FP', 'FN', 'TP'
    , 'trainingY_neg', 'trainingY_pos', 'blindY_neg', 'blindY_pos'
    , 'fit_time', 'score_time' )
    , range(1, 18)))

# Cross-validation score columns ('test_' prefix) -> common display names,
# so CV and blind-test frames end up with identical column labels.
scoreCV_mapD = { 'test_' + metric: label
                 for metric, label in ( ('mcc'      , 'MCC')
                                      , ('fscore'   , 'F1')
                                      , ('precision', 'Precision')
                                      , ('recall'   , 'Recall')
                                      , ('accuracy' , 'Accuracy')
                                      , ('roc_auc'  , 'ROC_AUC')
                                      , ('jcc'      , 'JCC') ) }

# Blind-test score columns ('bts_' prefix) -> the same display names as above.
scoreBT_mapD = { 'bts_' + metric: label
                 for metric, label in ( ('mcc'      , 'MCC')
                                      , ('fscore'   , 'F1')
                                      , ('precision', 'Precision')
                                      , ('recall'   , 'Recall')
                                      , ('accuracy' , 'Accuracy')
                                      , ('roc_auc'  , 'ROC_AUC')
                                      , ('jcc'      , 'JCC') ) }
# data dependent variables but NOT dependent on resampling
bts_size = len(X_bts)
yc2 = Counter(y_bts)
yc2_ratio = yc2[0]/yc2[1]
###############################################################################
#%% Running models ############################################################
print('\n#####################################################################\n'
, '\nRunning ML analysis: feature groups '
, '\nStarting--> Running ML analysis: Baseline modes (No FS)'
, '\nGene name:', gene
, '\nDrug name:', drug)
, '\nDrug name:', drug
, '\n#####################################################################\n')
#%% Basic: No Oversampling
#================
# Baseline
# No resampling
#================
scores_mmD = MultModelsCl(input_df = X
, target = y
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
baseline_all_scores = pd.DataFrame(scores_mmD)
rs_none = 'none'
#------------------------
# WF: only CV and BTS
#-----------------------
baseline_allT = baseline_all_scores.T
#baseline_train = baseline_all.filter(regex='train_', axis=1)
baseline_CV = baseline_allT.filter(regex='test_', axis = 1); baseline_CV.columns
# map colnames for consistency to allow concatenating
baseline_CV.columns = baseline_CV.columns.map(scoreCV_mapD); baseline_CV.columns
baseline_CV['Data_source'] = 'CV'
baseline_CV['Resampling'] = rs_none
baseline_BT = baseline_allT.filter(regex='bts_', axis = 1); baseline_BT.columns
# map colnames for consistency to allow concatenating
baseline_BT.columns = baseline_BT.columns.map(scoreBT_mapD); baseline_BT.columns
baseline_BT['Data_source'] = 'BT'
baseline_BT['Resampling'] = rs_none
# # Write csv
#baseline_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
#baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# baseline_CV.to_csv(outdir_ml + gene.lower() + '_baseline_CV_allF.csv')
# baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#-----------------------------------
# other data dependent variables
training_size_ns = len(X)
n_features = len(X.columns)
yc1 = Counter(y)
yc1_ratio = yc1[0]/yc1[1]
baseline_all = baseline_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
baseline_all = baseline_all.reset_index()
baseline_all.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
baseline_all['Data_source'] = baseline_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
baseline_all['score_type'] = baseline_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(baseline_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
baseline_all['score_order'] = baseline_all['score_type'].map(score_type_ordermapD)
baseline_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
baseline_all['Resampling'] = rs_none
baseline_all['training_size'] = training_size_ns
baseline_all['trainingY_ratio']= round(yc1_ratio,2)
baseline_all['n_features'] = n_features
###############################################################################
#%% SMOTE NC: Smote Oversampling [Numerical + categorical]
#================
# Baseline
# SMOTE NC: SMNC
#================
smnc_scores_mmD = MultModelsCl(input_df = X_smnc
, target = y_smnc
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
smnc_all_scores = pd.DataFrame(smnc_scores_mmD)
rs_smnc = 'smnc'
#------------------------
# WF: only CV and BTS
#-----------------------
smnc_allT = smnc_all_scores.T
smnc_CV = smnc_allT.filter(regex='test_', axis = 1); smnc_CV.columns
# map colnames for consistency to allow concatenating
smnc_CV.columns = smnc_CV.columns.map(scoreCV_mapD); smnc_CV.columns
smnc_CV['Data_source'] = 'CV'
smnc_CV['Resampling'] = rs_smnc
smnc_BT = smnc_allT.filter(regex='bts_', axis = 1); smnc_BT.columns
# map colnames for consistency to allow concatenating
smnc_BT.columns = smnc_BT.columns.map(scoreBT_mapD); smnc_BT.columns
smnc_BT['Data_source'] = 'BT'
smnc_BT['Resampling'] = rs_smnc
# Write csv
# smnc_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# smnc_CV.to_csv(outdir_ml + gene.lower() + '_smnc_CV_allF.csv')
# smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#-----------------------------------
# other data dependent variables
training_size_smnc = len(X_smnc)
n_features = len(X_smnc.columns)
yc1_smnc = Counter(y_smnc)
yc1_ratio_smnc = yc1_smnc[0]/yc1_smnc[1]
smnc_all = smnc_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
smnc_all = smnc_all.reset_index()
smnc_all.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
smnc_all['Data_source'] = smnc_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
smnc_all['score_type'] = smnc_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(smnc_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
smnc_all['score_order'] = smnc_all['score_type'].map(score_type_ordermapD)
smnc_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
smnc_all['Resampling'] = rs_smnc
smnc_all['training_size'] = training_size_smnc
smnc_all['trainingY_ratio'] = round(yc1_ratio_smnc,2)
smnc_all['n_features'] = n_features
###############################################################################
#%% ROS: Random Over Sampling [Numerical + categorical]
#================
# Baseline
# ROS
#================
ros_scores_mmD = MultModelsCl(input_df = X_ros
, target = y_ros
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
ros_all_scores = pd.DataFrame(ros_scores_mmD)
rs_ros = 'ros'
#------------------------
# WF: only CV and BTS
#-----------------------
ros_allT = ros_all_scores.T
ros_CV = ros_allT.filter(regex='test_', axis = 1); ros_CV.columns
# map colnames for consistency to allow concatenating
ros_CV.columns = ros_CV.columns.map(scoreCV_mapD); ros_CV.columns
ros_CV['Data_source'] = 'CV'
ros_CV['Resampling'] = rs_ros
ros_BT = ros_allT.filter(regex='bts_', axis = 1); ros_BT.columns
# map colnames for consistency to allow concatenating
ros_BT.columns = ros_BT.columns.map(scoreBT_mapD); ros_BT.columns
ros_BT['Data_source'] = 'BT'
ros_BT['Resampling'] = rs_ros
# Write csv
# ros_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# ros_CV.to_csv(outdir_ml + gene.lower() + '_ros_CV_allF.csv')
# ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#----------------------------------
# other data dependent variables
training_size_ros = len(X_ros)
n_features = len(X_ros.columns)
yc1_ros = Counter(y_ros)
yc1_ratio_ros = yc1_ros[0]/yc1_ros[1]
ros_all = ros_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
ros_all = ros_all.reset_index()
ros_all.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
ros_all['Data_source'] = ros_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
ros_all['score_type'] = ros_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(ros_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
ros_all['score_order'] = ros_all['score_type'].map(score_type_ordermapD)
ros_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
ros_all['Resampling'] = rs_ros
ros_all['training_size'] = training_size_ros
ros_all['trainingY_ratio'] = round(yc1_ratio_ros,2)
ros_all['n_features'] = n_features
###############################################################################
#%% RUS: Random Under Sampling [Numerical + categorical]
#================
# Baseline
# RUS
#================
rus_scores_mmD = MultModelsCl(input_df = X_rus
, target = y_rus
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
rus_all_scores = pd.DataFrame(rus_scores_mmD)
rs_rus = 'rus'
#-----------------------
# WF: only CV and BTS
#-----------------------
rus_allT = rus_all_scores.T
rus_CV = rus_allT.filter(regex='test_', axis=1); rus_CV.columns
# map colnames for consistency to allow concatenating
rus_CV.columns = rus_CV.columns.map(scoreCV_mapD); rus_CV.columns
rus_CV['Data_source'] = 'CV'
rus_CV['Resampling'] = rs_rus
rus_BT = rus_allT.filter(regex='bts_' , axis=1); rus_BT.columns
# map colnames for consistency to allow concatenating
rus_BT.columns = rus_BT.columns.map(scoreBT_mapD); rus_BT.columns
rus_BT['Data_source'] = 'BT'
rus_BT['Resampling'] = rs_rus
# # Write csv
# rus_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# rus_CV.to_csv(outdir_ml + gene.lower() + '_rus_CV_allF.csv')
# rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#----------------------------------
# other data dependent variables
training_size_rus = len(X_rus)
n_features = len(X_rus.columns)
yc1_rus = Counter(y_rus)
yc1_ratio_rus = yc1_rus[0]/yc1_rus[1]
rus_all = rus_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
rus_all = rus_all.reset_index()
rus_all.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
rus_all['Data_source'] = rus_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
rus_all['score_type'] = rus_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(rus_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
rus_all['score_order'] = rus_all['score_type'].map(score_type_ordermapD)
rus_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
rus_all['Resampling'] = rs_rus
rus_all['training_size'] = training_size_rus
rus_all['trainingY_ratio'] = round(yc1_ratio_rus,2)
rus_all['n_features'] = n_features
###############################################################################
#%% ROS+RUS Combined: [Numerical + categorical]
#================
# Baseline
# ROUC
#================
rouC_scores_mmD = MultModelsCl(input_df = X_rouC
, target = y_rouC
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
rouC_all_scores = pd.DataFrame(rouC_scores_mmD)
rs_rouC = 'rouC'
#-----------------------
# WF: only CV and BTS
#-----------------------
rouC_allT = rouC_all_scores.T
rouC_CV = rouC_allT.filter(regex='test_', axis=1); rouC_CV.columns
# map colnames for consistency to allow concatenating
rouC_CV.columns = rouC_CV.columns.map(scoreCV_mapD); rouC_CV.columns
rouC_CV['Data_source'] = 'CV'
rouC_CV['Resampling'] = rs_rouC
rouC_BT = rouC_allT.filter(regex='bts_', axis=1); rouC_BT.columns
# map colnames for consistency to allow concatenating
rouC_BT.columns = rouC_BT.columns.map(scoreBT_mapD); rouC_BT.columns
rouC_BT['Data_source'] = 'BT'
rouC_BT['Resampling'] = rs_rouC
# Write csv
# rouC_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# rouC_CV.to_csv(outdir_ml + gene.lower() + '_rouC_CV_allF.csv')
# rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#----------------------------------
# other data dependent variables
training_size_rouC = len(X_rouC)
n_features = len(X_rouC.columns)
yc1_rouC = Counter(y_rouC)
yc1_ratio_rouC = yc1_rouC[0]/yc1_rouC[1]
rouC_all = rouC_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
rouC_all = rouC_all.reset_index()
rouC_all.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
rouC_all['Data_source'] = rouC_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
rouC_all['score_type'] = rouC_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(rouC_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
rouC_all['score_order'] = rouC_all['score_type'].map(score_type_ordermapD)
rouC_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
rouC_all['Resampling'] = rs_rouC
rouC_all['training_size'] = training_size_rouC
rouC_all['trainingY_ratio'] = round(yc1_ratio_rouC,2)
rouC_all['n_features'] = n_features
###############################################################################
#%% COMBINING all dfs: WF and LF
# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
#%% Combine WF
#-----------------
# Combine WF
#-----------------
dfs_combine_wf = [baseline_BT, smnc_BT, ros_BT, rus_BT, rouC_BT,
baseline_CV, smnc_CV, ros_CV, rus_CV, rouC_CV]
dfs_nrows_wf = []
for df in dfs_combine_wf:
dfs_nrows_wf = dfs_nrows_wf + [len(df)]
dfs_nrows_wf = max(dfs_nrows_wf)
paramD = {
'baseline_paramD': { 'input_df' : X
, 'target' : y
, 'var_type' : 'mixed'
, 'resampling_type': 'none'}
, 'smnc_paramD': { 'input_df' : X_smnc
, 'target' : y_smnc
, 'var_type' : 'mixed'
, 'resampling_type' : 'smnc'}
dfs_ncols_wf = []
for df in dfs_combine_wf:
dfs_ncols_wf = dfs_ncols_wf + [len(df.columns)]
dfs_ncols_wf = max(dfs_ncols_wf)
, 'ros_paramD': { 'input_df' : X_ros
, 'target' : y_ros
, 'var_type' : 'mixed'
, 'resampling_type' : 'ros'}
expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
expected_ncols_wf = dfs_ncols_wf
, 'rus_paramD' : { 'input_df' : X_rus
, 'target' : y_rus
, 'var_type' : 'mixed'
, 'resampling_type' : 'rus'}
common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))
, 'rouC_paramD' : { 'input_df' : X_rouC
, 'target' : y_rouC
, 'var_type' : 'mixed'
, 'resampling_type' : 'rouC'}
}
if len(common_cols_wf) == dfs_ncols_wf :
combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index=False)
resampling_methods_wf = combined_baseline_wf[['Resampling']]
resampling_methods_wf = resampling_methods_wf.drop_duplicates()
print('\nConcatenating dfs with different resampling methods [WF]:', tts_split
, '\nNo. of dfs combining:', len(dfs_combine_wf)
, '\nThe sampling methods are:'
, '\n', resampling_methods_wf)
if len(combined_baseline_wf) == expected_nrows_wf and len(combined_baseline_wf.columns) == expected_ncols_wf:
print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
, '\nnrows in combined_df_wf:', len(combined_baseline_wf)
, '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
else:
print('\nFAIL: concatenating failed'
, '\nExpected nrows:', expected_nrows_wf
, '\nGot:', len(combined_baseline_wf)
, '\nExpected ncols:', expected_ncols_wf
, '\nGot:', len(combined_baseline_wf.columns))
sys.exit()
else:
sys.exit('\nConcatenting dfs not possible [WF],check numbers ')
# Initial run to get the dict containing CV, BT and metadata DFs
mmD = {}
for k, v in paramD.items():
# print(mmD[k])
scores_7030D = MultModelsCl(**paramD[k]
, tts_split_type = tts_split_7030
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True
, return_formatted_output = True)
mmD[k] = scores_7030D
# Concatenate the formatted outputs stored in mmD (one entry per resampling
# parameter set) into a single output df. pd.concat accepts the mapping
# directly, so a single call suffices; the previous loop over mmD.items()
# never used k or v and simply repeated this identical concat once per key.
out_wf_7030 = pd.concat(mmD, ignore_index = True)
# Add index as a column
combined_baseline_wf.columns
combined_baseline_wf = combined_baseline_wf.reset_index()
combined_baseline_wf.rename(columns = {'index': 'Model_name'}, inplace = True)
combined_baseline_wf.head()
# sort df: Resampling, Data_source, and MCC
combined_baseline_wf.sort_values(by = ['Resampling', 'Data_source', 'MCC'], ascending = [True, True, False], inplace = True)
##############################################################################
#%% Combine LF
#-----------------
# Combine LF*
#-----------------
dfs_combine = [baseline_all, smnc_all, ros_all, rus_all, rouC_all ]
dfs_nrows = []
for df in dfs_combine:
dfs_nrows = dfs_nrows + [len(df)]
dfs_nrows = max(dfs_nrows)
dfs_ncols = []
for df in dfs_combine:
dfs_ncols = dfs_ncols + [len(df.columns)]
dfs_ncols = max(dfs_ncols)
# dfs_ncols = []
# dfs_ncols2 = mode(dfs_ncols.append(len(df.columns) for df in dfs_combine)
# dfs_ncols2
expected_nrows = len(dfs_combine) * dfs_nrows
expected_ncols = dfs_ncols
common_cols = list(set.intersection(*(set(df.columns) for df in dfs_combine)))
if len(common_cols) == dfs_ncols :
combined_baseline = pd.concat([df[common_cols] for df in dfs_combine], ignore_index=True)
resampling_methods = combined_baseline[['Resampling', 'training_size']]
resampling_methods = resampling_methods.drop_duplicates()
print('\nConcatenating dfs with different resampling methods:', tts_split
, '\nNo. of dfs combining:', len(dfs_combine)
, '\nThe sampling methods are:'
, '\n', resampling_methods)
if len(combined_baseline) == expected_nrows and len(combined_baseline.columns) == expected_ncols:
print('\nPASS:', len(dfs_combine), 'dfs successfully combined'
, '\nnrows in combined_df:', len(combined_baseline)
, '\nncols in combined_df:', len(combined_baseline.columns))
else:
print('\nFAIL: concatenating failed'
, '\nExpected nrows:', expected_nrows
, '\nGot:', len(combined_baseline)
, '\nExpected ncols:', expected_ncols
, '\nGot:', len(combined_baseline.columns))
sys.exit()
else:
sys.exit('\nConcatenting dfs not possible,check numbers ')
# Add further column indications
combined_baseline['test_size'] = bts_size
combined_baseline['tts_split'] = tts_split
combined_baseline['testY_ratio'] = round(yc2_ratio,2)
#combined_baseline.columns
# change to column names to be lower case for consistency
combined_baseline.rename(columns = {'Resampling' : 'resampling'
, 'Data_source': 'data_source'}, inplace = True)
combined_baseline.columns
# sort df: resampling, data_source, mcc
combined_baseline.sort_values(by = ['resampling', 'data_source', 'score_order'], ascending = [True, True, True], inplace = True)
# # row bind
# if all(XXX):
# print('\nPASS:colnames match, proceeding to rowbind')
# comb_df = pd.concat()], axis = 0, ignore_index = True ) combined_baseline
print('\n######################################################################'
, '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
, '\nGene:', gene.lower()
, '\nDrug:', drug
, '\noutput file:', outFile_wf
, '\nDim of output:', out_wf_7030.shape
, '\n######################################################################')
###############################################################################
#====================
# Write output file
#====================
combined_baseline_wf.to_csv(outFile_wf, index = False)
#out_wf_7030.to_csv(outFile_wf, index = False)
print('\nFile successfully written:', outFile_wf)
combined_baseline.to_csv(outFile_lf, index = False)
print('\nFile successfully written:', outFile_lf)
###############################################################################