added test_func_combined.py
This commit is contained in:
parent
9cd6613da6
commit
26f284d76e
1 changed files with 130 additions and 0 deletions
130
scripts/ml/ml_functions/test_func_combined.py
Normal file
130
scripts/ml/ml_functions/test_func_combined.py
Normal file
|
@ -0,0 +1,130 @@
|
||||||
|
import pandas as pd
|
||||||
|
import os, sys
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.datasets import load_boston
|
||||||
|
from sklearn.ensemble import RandomForestRegressor
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.feature_selection import RFECV
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
###############################################################################
|
||||||
|
# Make the project's ML helper modules importable: extend the module search
# path with the ml_functions directory under the current user's home.
homedir = os.path.expanduser("~")
ml_functions_dir = homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions'
sys.path.append(ml_functions_dir)

# No effect when run as a script; in an interactive console this echoes the
# search path so the append above can be checked by eye.
sys.path
|
||||||
|
|
||||||
|
# import
|
||||||
|
from GetMLData import *
|
||||||
|
from SplitTTS import *
|
||||||
|
from MultClfs import *
|
||||||
|
from MultClfs_CVs import *
|
||||||
|
|
||||||
|
#%%
|
||||||
|
# Seed kept as a kwargs dict so it can be unpacked into any splitter/estimator.
rs = dict(random_state = 42)

# StratifiedKFold comes from the star imports above; configured here with
# 10 splits and shuffling, seeded through rs.
skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, **rs)

# Alternative splitters kept for reference (disabled):
#sel_cv = logo
# sel_cv = RepeatedStratifiedKFold(n_splits = 5
#                                  , n_repeats = 3
#                                  , **rs)
|
||||||
|
# Keyword arguments forwarded unchanged to getmldata()
#%% READ data
gene_model_paramD = {
    'data_combined_model': True,
    'use_or': False,
    'omit_all_genomic_features': False,
    'write_maskfile': False,
    'write_outfile': False,
}

# Exactly one gene/drug pair is loaded; to test another pair, comment out the
# active call and enable one of the alternatives below.
#df = getmldata(gene, drug, **gene_model_paramD)
#df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
df = getmldata('embB', 'ethambutol', **gene_model_paramD)
#df = getmldata('katG', 'isoniazid'   , **gene_model_paramD)
#df = getmldata('rpoB', 'rifampicin'  , **gene_model_paramD)
#df = getmldata('gid' , 'streptomycin', **gene_model_paramD)
#df = getmldata('alr' , 'cycloserine' , **gene_model_paramD)
|
||||||
|
##########################
|
||||||
|
#%% TEST different CV Thresholds for split_type = NONE
################################################################
# Imported explicitly so the class-count check below does not depend on a
# star import happening to re-export Counter.
from collections import Counter

# READ Data
spl_type = 'none'
data_type = 'complete'

# NOTE(review): this call originally passed `combined_df`, a name that is never
# assigned anywhere in this script. `df` (returned by getmldata() above with
# 'data_combined_model': True) is presumably the intended input -- confirm.
df2 = split_tts(ml_input_data = df
                , data_type = data_type
                , split_type = spl_type
                , oversampling = True
                , dst_colname = 'dst'
                , target_colname = 'dst_mode'
                , include_gene_name = True
                , random_state = 42 # default
                )

# Class counts of the target vectors returned by split_tts(). These must run
# after the call because they read df2; they are printed because a bare
# expression shows nothing when the file is run as a script.
print('y class counts:', Counter(df2['y']))
print('y_bts class counts:', Counter(df2['y_bts']))
|
||||||
|
#%% Trying different CV thresholds for resampling 'none' ONLY
# Arguments are gathered in one dict first so the call site stays short;
# the keys, values and their order are exactly those of the direct call.
# NOTE(review): resampling_type is upper-case 'NONE' here while paramD further
# down uses 'none', and var_type is a list here but a plain string there --
# confirm MultModelsCl_CVs treats both forms the same.
single_run_kwargs = dict(
    input_df = df2['X'],
    target = df2['y'],
    skf_cv_threshold = 10,              # IMP to change
    tts_split_type = spl_type,
    resampling_type = 'NONE',           # default
    add_cm = True,                      # adds confusion matrix based on cross_val_predict
    add_yn = True,                      # adds target var class numbers
    var_type = ['mixed'],
    scale_numeric = ['min_max'],
    random_state = 42,
    n_jobs = os.cpu_count(),
    return_formatted_output = False,
)

fooD = MultModelsCl_CVs(**single_run_kwargs)
|
||||||
|
|
||||||
|
# Print the 'test_mcc' entry stored for every model in fooD.
# NOTE(review): the label says TRAIN while the key read is 'test_mcc' --
# presumably a cross-validation score computed on the training data; confirm.
for model_name, model_scores in fooD.items():
    print('\nModel:', model_name, '\nTRAIN MCC:', model_scores['test_mcc'])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#%% TRY with dict containing different Resampling types
# One kwargs dict per resampling variant; each is later unpacked into
# MultModelsCl_CVs(). The variants differ only in which X/y pair of df2 they
# read and in the resampling_type label.
baseline_kwargs = {
    'input_df': df2['X'],
    'target': df2['y'],
    'var_type': 'mixed',
    'resampling_type': 'none',
}

smnc_kwargs = {
    'input_df': df2['X_smnc'],
    'target': df2['y_smnc'],
    'var_type': 'mixed',
    'resampling_type': 'smnc',
}

paramD = {
    'baseline_paramD': baseline_kwargs,
    'smnc_paramD': smnc_kwargs,
}
|
||||||
|
|
||||||
|
# Run MultModelsCl_CVs() for every parameter set in paramD at each CV
# threshold and store the stacked results per parameter set in mmDD.
mmDD = {}
for k, v in paramD.items():
    print(k)

    # Collect the per-threshold results and concatenate once after the loop.
    # Growing a DataFrame with pd.concat on every iteration re-copies all
    # earlier rows, and seeding it with an empty pd.DataFrame() relies on
    # concat behaviour for empty entries that pandas has deprecated.
    score_frames = []
    for skf_cv_threshold in [3, 5]:
        print('\nRunning CV threhhold:', skf_cv_threshold)
        # return_formatted_output = True is expected to yield a DataFrame,
        # since the result is passed to pd.concat below -- TODO confirm.
        current_scoreDF = MultModelsCl_CVs(**v
                                           , skf_cv_threshold = skf_cv_threshold # IMP to change
                                           , tts_split_type = spl_type
                                           #, resampling_type = 'XXXX' # supplied by paramD
                                           , add_cm = True # adds confusion matrix based on cross_val_predict
                                           , add_yn = True # adds target var class numbers
                                           #, var_type = ['mixed'] # supplied by paramD
                                           , scale_numeric = ['min_max']
                                           , random_state = 42
                                           , n_jobs = os.cpu_count()
                                           , return_formatted_output = True
                                           )
        score_frames.append(current_scoreDF)

    all_scoresDF = pd.concat(score_frames)
    mmDD[k] = all_scoresDF
|
||||||
|
|
||||||
|
# Show the stacked score table collected for each parameter set.
for param_set_name, scores_df in mmDD.items():
    print(param_set_name, scores_df)

# Two combined views of the same tables: out_wf discards the index entirely,
# out_wf2 keeps the mmDD keys as an outer index level.
out_wf = pd.concat(mmDD, ignore_index = True)
out_wf2 = pd.concat(mmDD)
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue