From ebef0c796730504d0683732c7ea927f468d6b1ae Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Tue, 5 Jul 2022 16:06:24 +0100
Subject: [PATCH] added test script to see one gene

---
 .../ml/ml_functions/test_func_singlegene.py   | 99 +++++++++++++++++++
 1 file changed, 99 insertions(+)
 create mode 100644 scripts/ml/ml_functions/test_func_singlegene.py

diff --git a/scripts/ml/ml_functions/test_func_singlegene.py b/scripts/ml/ml_functions/test_func_singlegene.py
new file mode 100644
index 0000000..d483514
--- /dev/null
+++ b/scripts/ml/ml_functions/test_func_singlegene.py
@@ -0,0 +1,99 @@
+"""Smoke-test the ML helper functions on a single gene/drug pair.
+
+Loads the pncA / pyrazinamide data with ``getmldata``, makes a '70_30' split
+with ``split_tts``, runs ``MultClfs_fi`` with a blind test set and prints the
+'test_mcc' and 'bts_mcc' scores of every returned model.
+"""
+import os
+import sys
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.feature_selection import RFECV
+from sklearn.model_selection import StratifiedKFold, train_test_split
+
+###############################################################################
+# Make the project-local ml_functions modules importable.
+homedir = os.path.expanduser("~")
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
+
+# Project-local helpers: getmldata(), split_tts() and MultClfs_fi().
+# NOTE(review): `rs` (unpacked into StratifiedKFold below) is not defined in
+# this script; presumably one of these star imports provides it - confirm.
+from GetMLData import *
+from SplitTTS import *
+from MultClfs_fi import *
+
+#%% Reference example, kept commented out: random-forest feature importances.
+# load_boston was removed in scikit-learn 1.2, so it is no longer imported at
+# module level (the import alone would abort this script on newer versions).
+# from sklearn.datasets import load_boston
+# X,y = load_boston(return_X_y=True)
+# features = load_boston()['feature_names']
+
+# X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)
+
+# rf = RandomForestRegressor(random_state=0)
+# rf.fit(X_train,y_train)
+
+# f_i = list(zip(features,rf.feature_importances_))
+# f_i.sort(key = lambda x : x[1])
+# plt.barh([x[0] for x in f_i],[x[1] for x in f_i])
+# plt.show()
+
+#%% Cross-validation splitter passed to MultClfs_fi as sel_cv.
+sel_cv = StratifiedKFold(n_splits=10, shuffle=True, **rs)
+# Alternatives:
+#sel_cv = logo
+# sel_cv = RepeatedStratifiedKFold(n_splits = 5
+#                                  , n_repeats = 3
+#                                  , **rs)
+
+# param dict for getmldata()
+gene_model_paramD = {
+    'data_combined_model': False,
+    'use_or': False,
+    'omit_all_genomic_features': False,
+    'write_maskfile': False,
+    'write_outfile': False,
+}
+
+#df = getmldata(gene, drug, **gene_model_paramD)
+df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
+
+df2 = split_tts(
+    df,
+    data_type='actual',
+    split_type='70_30',
+    oversampling=False,
+    dst_colname='dst',
+    target_colname='dst_mode',
+    include_gene_name=True,
+    random_state=42,  # default
+)
+
+# Sanity check on the feature columns. The result used to be computed and
+# thrown away; print it so the check is visible when run as a script.
+print('all X columns are gene_name (should be False):',
+      all(df2['X'].columns.isin(['gene_name'])))
+
+fooD = MultClfs_fi(
+    input_df=df2['X'],
+    target=df2['y'],
+    sel_cv=sel_cv,
+    run_blind_test=True,
+    blind_test_df=df2['X_bts'],
+    blind_test_target=df2['y_bts'],
+    tts_split_type='70_30',
+    var_type='mixed',
+    resampling_type='none',  # default
+)
+
+# Per-model summary: both MCC scores and the gap between them.
+for model_name, scores in fooD.items():
+    print('\nModel:', model_name,
+          '\nTRAIN MCC:', scores['test_mcc'],
+          '\nBTS MCC:', scores['bts_mcc'],
+          '\nDIFF:', scores['bts_mcc'] - scores['test_mcc'])