Trying different CV thresholds for a single gene
This commit is contained in:
parent
8d8a61675f
commit
b87f8d0295
2 changed files with 54 additions and 461 deletions
|
@ -15,8 +15,7 @@ sys.path
|
|||
from GetMLData import *
|
||||
from SplitTTS import *
|
||||
from MultClfs import *
|
||||
from MultClfs_noBTS import *
|
||||
|
||||
from MultClfs_CVs import *
|
||||
|
||||
#%%
|
||||
rs = {'random_state': 42}
|
||||
|
@ -27,6 +26,7 @@ skf_cv = StratifiedKFold(n_splits = 10
|
|||
# , n_repeats = 3
|
||||
# , **rs)
|
||||
# param dict for getmldata()
|
||||
#%% READ data
|
||||
gene_model_paramD = {'data_combined_model' : False
|
||||
, 'use_or' : False
|
||||
, 'omit_all_genomic_features': False
|
||||
|
@ -40,7 +40,7 @@ df = getmldata('embB', 'ethambutol' , **gene_model_paramD)
|
|||
#df = getmldata('rpoB', 'rifampicin' , **gene_model_paramD)
|
||||
#df = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
|
||||
#df = getmldata('alr' , 'cycloserine' , **gene_model_paramD)
|
||||
|
||||
#%% SPLIT, Data and Resampling types
|
||||
all(df.columns.isin(['gene_name'])) # should be False
|
||||
spl_type = '70_30'
|
||||
#spl_type = '80_20'
|
||||
|
@ -143,11 +143,13 @@ from sklearn.utils import all_estimators
|
|||
all_clfs = all_estimators(type_filter="classifier")
|
||||
df = pd.DataFrame (all_clfs, columns = ['classifier_name', 'classifier_fn'])
|
||||
df.to_csv("Model_names_ALL.csv")
|
||||
################################################################
|
||||
#%% TEST different CV Thresholds for split_type = NONE
|
||||
|
||||
################################################################
|
||||
Counter(df2['y'])
|
||||
Counter(df2['y_bts'])
|
||||
|
||||
# READ Data
|
||||
spl_type = 'none'
|
||||
data_type = "complete"
|
||||
|
||||
|
@ -160,13 +162,13 @@ df2 = split_tts(df
|
|||
, include_gene_name = True
|
||||
, random_state = 42 # default
|
||||
)
|
||||
|
||||
fooD = MultModelsCl_noBTS(input_df = df2['X']
|
||||
#%% Trying different CV thresholds for resampling 'none' ONLY
|
||||
fooD = MultModelsCl_CVs(input_df = df2['X']
|
||||
, target = df2['y']
|
||||
, skf_cv_threshold = 10 # IMP to change
|
||||
|
||||
, tts_split_type = spl_type
|
||||
, resampling_type = 'XXXX' # default
|
||||
, resampling_type = 'NONE' # default
|
||||
|
||||
, add_cm = True # adds confusion matrix based on cross_val_predict
|
||||
, add_yn = True # adds target var class numbers
|
||||
|
@ -185,7 +187,7 @@ for k, v in fooD.items():
|
|||
)
|
||||
|
||||
# formatted df
|
||||
foo_df3 = MultModelsCl_noBTS(input_df = df2['X']
|
||||
foo_df3 = MultModelsCl_CVs(input_df = df2['X']
|
||||
, target = df2['y']
|
||||
, skf_cv_threshold = 5 # IMP to change
|
||||
|
||||
|
@ -203,6 +205,7 @@ foo_df3 = MultModelsCl_noBTS(input_df = df2['X']
|
|||
|
||||
)
|
||||
|
||||
|
||||
dfs_combine_wf = [foo_df, foo_df2, foo_df3]
|
||||
|
||||
common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))
|
||||
|
@ -246,3 +249,46 @@ if len(common_cols_wf) == dfs_ncols_wf :
|
|||
, '\nGot:', len(combined_baseline_wf.columns))
|
||||
sys.exit('\nFIRST IF FAILS')
|
||||
|
||||
#%% TRY with dict containing different Resampling types
# Keyword arguments for MultModelsCl_CVs(), one entry per resampling type.
# Each entry pairs a feature set from df2 with its matching target and names
# the resampling type it corresponds to.
paramD = {
    'baseline_paramD': {'input_df'        : df2['X']
                        , 'target'          : df2['y']
                        , 'var_type'        : 'mixed'
                        , 'resampling_type' : 'none'}

    , 'smnc_paramD'  : {'input_df'        : df2['X_smnc']
                        , 'target'          : df2['y_smnc']
                        , 'var_type'        : 'mixed'
                        , 'resampling_type' : 'smnc'}
}

# Scores per resampling type: {paramD key: scores for all CV thresholds}.
mmDD = {}
for k, v in paramD.items():
    print(k)

    # Collect one result per CV threshold and concatenate once, instead of
    # growing a frame that was seeded with an empty DataFrame on every pass.
    scoreDFs = []
    for skf_cv_threshold in [3, 5]:
        print('\nRunning CV threshold:', skf_cv_threshold)
        # input_df, target, var_type and resampling_type come from the
        # paramD entry, so they are not repeated as explicit keywords here.
        current_scoreDF = MultModelsCl_CVs(**v
                                           , skf_cv_threshold = skf_cv_threshold # IMP to change
                                           , tts_split_type   = spl_type

                                           , add_cm = True # adds confusion matrix based on cross_val_predict
                                           , add_yn = True # adds target var class numbers

                                           , scale_numeric = ['min_max']
                                           , random_state  = 42
                                           , n_jobs        = os.cpu_count()
                                           , return_formatted_output = True
                                           )
        scoreDFs.append(current_scoreDF)

    # Fall back to an empty frame if no threshold was run, as the original
    # accumulator would have produced.
    all_scoresDF = pd.concat(scoreDFs) if scoreDFs else pd.DataFrame()
    mmDD[k] = all_scoresDF

for k, v in mmDD.items():
    print(k, v)

# Two combined views of the per-resampling results:
# out_wf renumbers the rows; out_wf2 is built with the default index handling.
out_wf = pd.concat(mmDD, ignore_index = True)
out_wf2 = pd.concat(mmDD)
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue