LSHTM_analysis/scripts/ml/ml_functions/test_func_singlegene.py

142 lines
4.6 KiB
Python

import pandas as pd
import os, sys
import numpy as np
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
###############################################################################
# Put the project's ML helper directory on the module search path so that the
# star-imports that follow (GetMLData, SplitTTS, MultClfs) can be resolved.
homedir = os.path.expanduser("~")
ml_functions_dir = homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions'
sys.path.append(ml_functions_dir)
# Bare expression: echoes the search path in an interactive console and has no
# effect when the file is run as a script.
sys.path
# import
from GetMLData import *
from SplitTTS import *
from MultClfs import *
#from MultClfs_SIMPLE import *
#%%
# Shared seed, unpacked into every call that accepts `random_state`.
rs = {'random_state': 42}

# Cross-validation splitter handed to the model runner as `sel_cv`:
# 10 stratified folds, shuffled with the shared seed.
skf_cv = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    **rs
)

# Alternative splitters, currently disabled:
#sel_cv = logo
# sel_cv = RepeatedStratifiedKFold(n_splits = 5
# , n_repeats = 3
# , **rs)
# Keyword arguments forwarded to getmldata(); every option is switched off.
gene_model_paramD = dict(
    data_combined_model=False,
    use_or=False,
    omit_all_genomic_features=False,
    write_maskfile=False,
    write_outfile=False,
)

# Load the data for one gene/drug pair. The other pairs are kept as
# commented-out alternatives; enable exactly one assignment to `df`.
#df = getmldata(gene, drug, **gene_model_paramD)
#df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
df = getmldata('embB', 'ethambutol', **gene_model_paramD)
#df = getmldata('katG', 'isoniazid' , **gene_model_paramD)
#df = getmldata('rpoB', 'rifampicin' , **gene_model_paramD)
#df = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
#df = getmldata('alr' , 'cycloserine' , **gene_model_paramD)

# Interactive sanity check (the value is discarded when run as a script).
# NOTE(review): all(...) is True only when *every* column is named
# 'gene_name', so it is False for almost any frame; if the intent is to test
# whether a 'gene_name' column is present, any(...) is probably what was
# meant - confirm before relying on this check.
all(df.columns.isin(['gene_name'])) # should be False
# Split scheme passed to split_tts(); alternatives are kept for quick switching.
spl_type = '70_30'
#spl_type = '80_20'
#spl_type = 'sl'

# Data subset passed to split_tts().
#data_type = "actual"
data_type = "complete"

# Build the split. The result is indexed below with the keys
# 'X', 'y', 'X_bts' and 'y_bts'.
df2 = split_tts(
    df,
    data_type=data_type,
    split_type=spl_type,
    oversampling=True,
    dst_colname='dst',
    target_colname='dst_mode',
    include_gene_name=True,
    random_state=42,  # default
)

# Interactive sanity checks: bare expressions whose values are only shown in
# a console and are discarded when the file is run as a script.
all(df2['X'].columns.isin(['gene_name'])) # should be False
df['dst'].value_counts()
df['dst'].isna().sum()
df['dst_mode'].value_counts()
len(df)
Counter(df2['y'])
Counter(df2['y_bts'])
# Run the classifier battery on the split above, including the blind test set.
# The returned mapping is keyed by model name; each value holds that model's
# scores (read further down via 'test_mcc', 'bts_mcc', 'test_accuracy',
# 'bts_accuracy').
fooD = MultModelsCl(
    input_df=df2['X'],
    target=df2['y'],
    sel_cv=skf_cv,
    run_blind_test=True,
    blind_test_df=df2['X_bts'],
    blind_test_target=df2['y_bts'],
    tts_split_type=spl_type,
    resampling_type='XXXX',  # default
    var_type=['mixed'],
    scale_numeric=['min_max'],
    return_formatted_output=False,
)
# For every model in fooD, print its 'test_<metric>' score (labelled TRAIN),
# its 'bts_<metric>' score (labelled BTS) and the difference BTS - TRAIN.
# All models are reported for MCC first, then all models for accuracy, which
# is the same order and text as two separate copy-pasted loops would produce.
for metric_label, metric_key in (('MCC', 'mcc'), ('ACCURACY', 'accuracy')):
    test_key = 'test_' + metric_key
    bts_key = 'bts_' + metric_key
    # Use the value yielded by .items() instead of re-indexing fooD by key.
    for model_name, scores in fooD.items():
        print('\nModel:', model_name
              , '\nTRAIN ' + metric_label + ':', scores[test_key]
              , '\nBTS ' + metric_label + ':', scores[bts_key]
              , '\nDIFF:', scores[bts_key] - scores[test_key])
#%% CHECK SCALING
# Scale two numeric columns with MinMaxScaler(-1, 1) and with StandardScaler,
# and place raw and scaled values side by side for visual inspection.
#
# NOTE(review): combined_model_paramD is not defined anywhere in this file
# (only gene_model_paramD is). Unless one of the star-imported modules or the
# interactive session provides it, this line raises NameError - confirm which
# parameter dict is intended.
embb_df = getmldata('embB', 'ethambutol', **combined_model_paramD)
all(embb_df.columns.isin(['gene_name'])) # should be False

raw_colnames = ['vdwclashes_rr', 'electro_rr']
scaled_colnames = ['vdw_scaled', 'ele_scaled']

# fit_transform() returns a bare array. The scaled frame is built on the
# source frame's index so that the axis=1 concat below pairs each raw row with
# its own scaled row; a default RangeIndex would misalign (and NaN-pad) the
# rows whenever embb_df is not indexed 0..n-1.
scaler = MinMaxScaler(feature_range=(-1, 1))
bar = embb_df[raw_colnames]
bar_df1 = pd.DataFrame(scaler.fit_transform(bar)
                       , index=bar.index
                       , columns=scaled_colnames)
bar2 = pd.concat([bar, bar_df1], axis=1)

scaler2 = StandardScaler()
baz = embb_df[raw_colnames]
baz_df1 = pd.DataFrame(scaler2.fit_transform(baz)
                       , index=baz.index
                       , columns=scaled_colnames)
baz2 = pd.concat([baz, baz_df1], axis=1)

# Min-max block and standard-scaled block next to each other.
a = pd.concat([bar2, baz2], axis=1)
#%% test added split_types i.e none_with_bts and none_only
# Pick one split type to exercise. The candidates not under test are kept as
# commented-out toggles; 'rt' is the value in effect.
#spl_type = 'none_only'
#spl_type = 'none_with_bts'
spl_type = 'rt'

#data_type = "actual"
data_type = "complete"

# Redo the split with the selected split type (same remaining arguments as the
# earlier split_tts() call in this script).
df2 = split_tts(
    df,
    data_type=data_type,  # only works with complete despite what you set to
    split_type=spl_type,
    oversampling=True,
    dst_colname='dst',
    target_colname='dst_mode',
    include_gene_name=True,
    random_state=42,  # default
)

# Interactive sanity check; the value is discarded when run as a script.
all(df2['X'].columns.isin(['gene_name'])) # should be False
# Export the name and class of every scikit-learn classifier to a CSV file.
# The imports are repeated here so this part can be run on its own.
import pandas as pd
from sklearn.utils import all_estimators

# One (name, class) pair per estimator that scikit-learn lists as a classifier.
all_clfs = all_estimators(type_filter="classifier")

# Use a dedicated name for this table: binding it to `df` would overwrite the
# ML dataset that the split_tts(df, ...) calls earlier in this script read,
# which breaks re-running those parts in the same session.
all_clfs_df = pd.DataFrame(all_clfs, columns=['classifier_name', 'classifier_fn'])

# Written relative to the current working directory.
all_clfs_df.to_csv("Model_names_ALL.csv")