Added combined_model_iterator.py, which includes oversampling

This commit is contained in:
Tanushree Tunstall 2022-09-02 09:50:51 +01:00
parent 338dd329e9
commit c845d96102
3 changed files with 332 additions and 94 deletions

View file

@ -17,6 +17,11 @@ from SplitTTS import *
from MultClfs import *
from MultClfs_CVs import *
#====================
# Import ML functions
#====================
from ml_data_combined import *
#%%
rs = {'random_state': 42}
skf_cv = StratifiedKFold(n_splits = 10
@ -35,7 +40,7 @@ gene_model_paramD = {'data_combined_model' : True
#df = getmldata(gene, drug, **gene_model_paramD)
#df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
df = getmldata('embB', 'ethambutol' , **gene_model_paramD)
#df = getmldata('embB', 'ethambutol' , **gene_model_paramD)
#df = getmldata('katG', 'isoniazid' , **gene_model_paramD)
#df = getmldata('rpoB', 'rifampicin' , **gene_model_paramD)
#df = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
@ -43,9 +48,6 @@ df = getmldata('embB', 'ethambutol' , **gene_model_paramD)
##########################
#%% TEST different CV Thresholds for split_type = NONE
################################################################
Counter(df2['y'])
Counter(df2['y_bts'])
# READ Data
spl_type = 'none'
data_type = 'complete'
@ -59,6 +61,9 @@ df2 = split_tts(ml_input_data = combined_df
, include_gene_name = True
, random_state = 42 # default
)
Counter(df2['y'])
Counter(df2['y_bts'])
#%% Trying different CV thresholds for resampling 'none' ONLY
fooD = MultModelsCl_CVs(input_df = df2['X']
, target = df2['y']
@ -80,7 +85,8 @@ fooD = MultModelsCl_CVs(input_df = df2['X']
for k, v in fooD.items():
print('\nModel:', k
, '\nTRAIN MCC:', fooD[k]['test_mcc']
, '\nTRAIN MCC:', fooD[k]['train_mcc']
, '\nCV MCC:', fooD[k]['test_mcc']
)