various edits

This commit is contained in:
Tanushree Tunstall 2022-07-28 13:19:30 +01:00
parent 90b9477520
commit 8d8a61675f
3 changed files with 601 additions and 1 deletions

View file

@ -15,7 +15,8 @@ sys.path
from GetMLData import *
from SplitTTS import *
from MultClfs import *
#from MultClfs_SIMPLE import *
from MultClfs_noBTS import *
#%%
# Shared keyword arguments: one fixed seed so every estimator/split in
# this script is reproducible (splatted as **rs at call sites).
rs = dict(random_state=42)
@ -69,6 +70,8 @@ len(df)
Counter(df2['y'])      # class counts of the training target (balance sanity check)
Counter(df2['y_bts'])  # class counts of the second target split — presumably 'bts' = blind test set; confirm against SplitTTS
#%% Run Multiple models
fooD = MultModelsCl(input_df = df2['X']
, target = df2['y']
, sel_cv = skf_cv
@ -140,3 +143,106 @@ from sklearn.utils import all_estimators
# Dump the names of every sklearn classifier to CSV for reference.
all_clfs = all_estimators(type_filter="classifier")
# Use a dedicated name here: the original rebound `df`, which clobbered
# the input dataset DataFrame that split_tts(df, ...) consumes further
# down this script.
clf_names_df = pd.DataFrame(all_clfs, columns=['classifier_name', 'classifier_fn'])
clf_names_df.to_csv("Model_names_ALL.csv")
#%% TEST different CV Thresholds for split_type = NONE
Counter(df2['y'])      # re-check class balance before re-splitting
Counter(df2['y_bts'])
spl_type = 'none'
data_type = "complete"
# Rebuild the train/test dict with no explicit TTS split; keyword
# arguments gathered once so the call reads as a single config block.
split_kwargs = dict(data_type=data_type,
                    split_type=spl_type,
                    oversampling=True,
                    dst_colname='dst',
                    target_colname='dst_mode',
                    include_gene_name=True,
                    random_state=42)  # default
df2 = split_tts(df, **split_kwargs)
# Run every model with a 10-fold CV threshold on the unsplit data; raw
# (unformatted) per-model score dict is returned.
fooD = MultModelsCl_noBTS(
    input_df=df2['X'],
    target=df2['y'],
    skf_cv_threshold=10,       # IMP to change
    tts_split_type=spl_type,
    resampling_type='XXXX',    # default
    add_cm=True,               # adds confusion matrix based on cross_val_predict
    add_yn=True,               # adds target var class numbers
    var_type=['mixed'],
    scale_numeric=['min_max'],
    random_state=42,
    n_jobs=os.cpu_count(),
    return_formatted_output=False,
)
# Report the cross-validated MCC for every fitted model.
# NOTE(review): the label says TRAIN but the key is 'test_mcc' — with
# sklearn cross_validate naming that is the score on the held-out CV
# folds of the training data; confirm against MultClfs_noBTS.
for model_name, model_scores in fooD.items():
    print('\nModel:', model_name
          , '\nTRAIN MCC:', model_scores['test_mcc']
          )
# formatted df: same run as above but with a 5-fold CV threshold and the
# output returned pre-formatted for the row-bind below.
foo_df3 = MultModelsCl_noBTS(
    input_df=df2['X'],
    target=df2['y'],
    skf_cv_threshold=5,        # IMP to change
    tts_split_type=spl_type,
    resampling_type='XXXX',    # default
    add_cm=True,               # adds confusion matrix based on cross_val_predict
    add_yn=True,               # adds target var class numbers
    var_type=['mixed'],
    scale_numeric=['min_max'],
    random_state=42,
    n_jobs=os.cpu_count(),
    return_formatted_output=True,
)
# Row-bind the formatted score dfs from the different CV-threshold runs
# and verify the combined shape.
dfs_combine_wf = [foo_df, foo_df2, foo_df3]

# Columns shared by every df; sorted so the column order of the
# concatenated result is deterministic (raw set-intersection order is
# arbitrary across runs).
common_cols_wf = sorted(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))

print('\nCombining', len(dfs_combine_wf), 'using pd.concat by row ~ rowbind'
      , '\nChecking Dims of df to combine:'
      # NOTE(review): scoresDF_CV/scoresDF_BT are defined elsewhere in
      # the file and look stale here — the dfs actually combined below
      # are foo_df/foo_df2/foo_df3; confirm which dims should be shown.
      , '\nDim of CV:', scoresDF_CV.shape
      , '\nDim of BT:', scoresDF_BT.shape)
#print(scoresDF_CV)
#print(scoresDF_BT)

# Expected shape after row-binding: rows add up across dfs, common
# column count must equal every df's column count.
dfs_nrows_wf = max(len(df) for df in dfs_combine_wf)
dfs_ncols_wf = max(len(df.columns) for df in dfs_combine_wf)
print(dfs_ncols_wf)

expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
expected_ncols_wf = dfs_ncols_wf

# NOTE(review): indentation was lost in the diff rendering; structure
# reconstructed so that sys.exit('\nFIRST IF FAILS') is the else-branch
# of this first if, matching its message — confirm against the original.
if len(common_cols_wf) == dfs_ncols_wf:
    combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf]
                                     , ignore_index=False)
    print('\nConcatenating dfs with different resampling methods [WF]:'
          , '\nSplit type:', spl_type
          , '\nNo. of dfs combining:', len(dfs_combine_wf))
    if len(combined_baseline_wf) == expected_nrows_wf and len(combined_baseline_wf.columns) == expected_ncols_wf:
        print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
              , '\nnrows in combined_df_wf:', len(combined_baseline_wf)
              , '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
    else:
        print('\nFAIL: concatenating failed'
              , '\nExpected nrows:', expected_nrows_wf
              , '\nGot:', len(combined_baseline_wf)
              , '\nExpected ncols:', expected_ncols_wf
              , '\nGot:', len(combined_baseline_wf.columns))
else:
    sys.exit('\nFIRST IF FAILS')