added test script to see one gene
This commit is contained in:
parent
79cb89a019
commit
ebef0c7967
1 changed files with 79 additions and 0 deletions
79
scripts/ml/ml_functions/test_func_singlegene.py
Normal file
79
scripts/ml/ml_functions/test_func_singlegene.py
Normal file
|
@ -0,0 +1,79 @@
|
|||
import pandas as pd
|
||||
import os, sys
|
||||
import numpy as np
|
||||
from sklearn.datasets import load_boston
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.feature_selection import RFECV
|
||||
import matplotlib.pyplot as plt
|
||||
###############################################################################
|
||||
# Make the project's ML helper modules (imported just below) resolvable by
# appending their directory, located under the user's home, to the search path.
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
sys.path  # no effect when run as a script; only echoes the path in an interactive cell
|
||||
|
||||
# import
|
||||
from GetMLData import *
|
||||
from SplitTTS import *
|
||||
from MultClfs_fi import *
|
||||
|
||||
#%%
|
||||
# X,y = load_boston(return_X_y=True)
|
||||
# features = load_boston()['feature_names']
|
||||
|
||||
# X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)
|
||||
|
||||
# rf = RandomForestRegressor(random_state=0)
|
||||
# rf.fit(X_train,y_train)
|
||||
|
||||
|
||||
# f_i = list(zip(features,rf.feature_importances_))
|
||||
# f_i.sort(key = lambda x : x[1])
|
||||
# plt.barh([x[0] for x in f_i],[x[1] for x in f_i])
|
||||
# plt.show()
|
||||
|
||||
#%%
|
||||
|
||||
# Cross-validation splitter handed to MultClfs_fi further down:
# 10 stratified folds with shuffling enabled.
# NOTE(review): `rs` and `StratifiedKFold` are not defined or imported by name
# in this file; presumably they arrive via the star imports above, with `rs`
# being a dict of random-state keyword arguments — TODO confirm.
sel_cv = StratifiedKFold(n_splits = 10
                         , shuffle = True
                         , **rs)
|
||||
#sel_cv = logo
|
||||
# sel_cv = RepeatedStratifiedKFold(n_splits = 5
|
||||
# , n_repeats = 3
|
||||
# , **rs)
|
||||
# param dict for getmldata()
|
||||
# Keyword arguments forwarded to getmldata(); every switch is set to False
# for this single-gene test run.
gene_model_paramD = {'data_combined_model'        : False
                     , 'use_or'                    : False
                     , 'omit_all_genomic_features' : False
                     , 'write_maskfile'            : False
                     , 'write_outfile'             : False }

# Generic form of the call, kept for reference:
#df = getmldata(gene, drug, **gene_model_paramD)
# Single gene/drug pair exercised by this test script.
df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
|
||||
|
||||
# Split the gene data with the project's split_tts() helper. The result is
# indexed below with the keys 'X', 'y', 'X_bts' and 'y_bts'.
df2 = split_tts(df
                , data_type = 'actual'
                , split_type = '70_30'
                , oversampling = False
                , dst_colname = 'dst'
                , target_colname = 'dst_mode'
                , include_gene_name = True
                , random_state = 42 # default
                )

# NOTE(review): this is a bare expression, so its value is discarded when the
# file runs as a script (it is only displayed in an interactive cell). Also,
# all(... isin([...])) is False as soon as any column other than 'gene_name'
# exists, so it does not tell whether 'gene_name' is present — confirm whether
# any(...) or an explicit assert was intended.
all(df2['X'].columns.isin(['gene_name'])) # should be False
|
||||
|
||||
# Run the project's MultClfs_fi() on the training split with the blind-test
# split supplied as well. The result is iterated below as a mapping from model
# name to a dict of scores.
fooD = MultClfs_fi(input_df = df2['X']
                   , target = df2['y']
                   , sel_cv = sel_cv
                   , run_blind_test = True
                   , blind_test_df = df2['X_bts']
                   , blind_test_target = df2['y_bts']
                   , tts_split_type = '70_30'
                   , var_type = 'mixed'
                   , resampling_type = 'none' # default
                   )
|
||||
|
||||
# Per-model report: print the 'test_mcc' and 'bts_mcc' scores and their
# difference (blind-test minus 'test_mcc').
# NOTE(review): the label says TRAIN but the key read is 'test_mcc';
# presumably this is the cross-validation score on the training split —
# confirm against MultClfs_fi.
for model_name, scores in fooD.items():
    print('\nModel:', model_name
          , '\nTRAIN MCC:', scores['test_mcc']
          , '\nBTS MCC:' , scores['bts_mcc']
          , '\nDIFF:', scores['bts_mcc'] - scores['test_mcc'])
|
Loading…
Add table
Add a link
Reference in a new issue