import pandas as pd
import os, sys
import numpy as np
from collections import Counter                                  # used for class counts below
# from sklearn.datasets import load_boston                       # unused here; removed in scikit-learn >= 1.2
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import MinMaxScaler, StandardScaler   # used in the scaling checks below
import matplotlib.pyplot as plt
###############################################################################
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
sys.path

# import project-local ML helpers
from GetMLData import *
from SplitTTS import *
from MultClfs import *
from MultClfs_noBTS import *
#%%
rs = {'random_state': 42}
skf_cv = StratifiedKFold(n_splits = 10
                         , shuffle = True, **rs)
#sel_cv = logo
# sel_cv = RepeatedStratifiedKFold(n_splits = 5
#                                  , n_repeats = 3
#                                  , **rs)

# param dict for getmldata()
gene_model_paramD = {'data_combined_model'          : False
                     , 'use_or'                     : False
                     , 'omit_all_genomic_features'  : False
                     , 'write_maskfile'             : False
                     , 'write_outfile'              : False}
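# NOTE (reader aid): these flags are passed straight through to getmldata(); going by
# their names they toggle the combined-gene model, odds-ratio (OR) features, genomic
# features, and whether mask/output files are written. This summary is inferred from
# the flag names, not from getmldata()'s documentation.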
#df = getmldata(gene, drug, **gene_model_paramD)
#df = getmldata('pncA', 'pyrazinamide' , **gene_model_paramD)
df  = getmldata('embB', 'ethambutol'   , **gene_model_paramD)
#df = getmldata('katG', 'isoniazid'    , **gene_model_paramD)
#df = getmldata('rpoB', 'rifampicin'   , **gene_model_paramD)
#df = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
#df = getmldata('alr' , 'cycloserine'  , **gene_model_paramD)

# NOTE: all() over multiple columns is trivially False; any() would actually test
# whether a 'gene_name' column is present.
all(df.columns.isin(['gene_name'])) # should be False
spl_type = '70_30'
#spl_type = '80_20'
#spl_type = 'sl'

#data_type = "actual"
data_type = "complete"
df2 = split_tts(df
                , data_type = data_type
                , split_type = spl_type
                , oversampling = True
                , dst_colname = 'dst'
                , target_colname = 'dst_mode'
                , include_gene_name = True
                , random_state = 42 # default
                )
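# split_tts() is a project-local helper; judging by how df2 is used below, it returns a
# dict-like object with at least 'X'/'y' (training data) and 'X_bts'/'y_bts' (blind
# test set) entries. This is inferred from usage, not from the function's docs.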
all(df2['X'].columns.isin(['gene_name'])) # should be False

# quick sanity checks: target distribution, missing 'dst' values, and class counts
# in the training (y) and blind-test (y_bts) targets
df['dst'].value_counts()
df['dst'].isna().sum()
df['dst_mode'].value_counts()

len(df)

Counter(df2['y'])
Counter(df2['y_bts'])
#%% Run Multiple models

fooD = MultModelsCl(input_df = df2['X']
                    , target = df2['y']
                    , sel_cv = skf_cv
                    , run_blind_test = True
                    , blind_test_df = df2['X_bts']
                    , blind_test_target = df2['y_bts']
                    , tts_split_type = spl_type
                    , resampling_type = 'XXXX' # default
                    , var_type = ['mixed']
                    , scale_numeric = ['min_max']
                    , return_formatted_output = False
                    )
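# MultModelsCl() is a project-local helper; judging by the loops below, it returns a dict
# keyed by model name, each value holding CV metrics ('test_mcc', 'test_accuracy', ...)
# and blind-test metrics ('bts_mcc', 'bts_accuracy', ...). Inferred from usage here.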
for k, v in fooD.items():
    print('\nModel:', k
          , '\nTRAIN MCC:', fooD[k]['test_mcc']
          , '\nBTS MCC:'  , fooD[k]['bts_mcc']
          , '\nDIFF:'     , fooD[k]['bts_mcc'] - fooD[k]['test_mcc'])

for k, v in fooD.items():
    print('\nModel:', k
          , '\nTRAIN ACCURACY:', fooD[k]['test_accuracy']
          , '\nBTS ACCURACY:'  , fooD[k]['bts_accuracy']
          , '\nDIFF:'          , fooD[k]['bts_accuracy'] - fooD[k]['test_accuracy'])
#%% CHECK SCALING
# NOTE: the original called getmldata() with 'combined_model_paramD', which is not
# defined in this script; the gene-level param dict is reused here so the block runs.
embb_df = getmldata('embB', 'ethambutol', **gene_model_paramD)
all(embb_df.columns.isin(['gene_name'])) # should be False

# min-max scaling to [-1, 1]
scaler  = MinMaxScaler(feature_range = (-1, 1))
bar     = embb_df[['vdwclashes_rr', 'electro_rr']]
bar_df1 = scaler.fit_transform(bar)
bar_df1 = pd.DataFrame(bar_df1)
bar_df1.rename(columns = {0: 'vdw_scaled', 1: 'ele_scaled'}, inplace = True)
bar2    = pd.concat([bar, bar_df1], axis = 1)

# standardisation (zero mean, unit variance)
scaler2 = StandardScaler()
baz     = embb_df[['vdwclashes_rr', 'electro_rr']]
baz_df1 = scaler2.fit_transform(baz)
baz_df1 = pd.DataFrame(baz_df1)
baz_df1.rename(columns = {0: 'vdw_scaled', 1: 'ele_scaled'}, inplace = True)
baz2    = pd.concat([baz, baz_df1], axis = 1)

a = pd.concat([bar2, baz2], axis = 1)
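# A minimal, self-contained sketch of the same comparison on made-up numbers, so the
# effect of MinMaxScaler(feature_range=(-1, 1)) vs StandardScaler can be inspected
# without the project data. The toy values and column names below are illustrative only.
toy_df = pd.DataFrame({'vdwclashes_rr': [0.0, 1.5, 3.0, 10.0],
                       'electro_rr'   : [-2.0, 0.0, 0.5, 4.0]})
mm  = MinMaxScaler(feature_range = (-1, 1)).fit_transform(toy_df)  # each column mapped onto [-1, 1]
std = StandardScaler().fit_transform(toy_df)                       # each column to mean 0, unit variance
print(pd.DataFrame(mm,  columns = ['vdw_minmax', 'ele_minmax']))
print(pd.DataFrame(std, columns = ['vdw_std', 'ele_std']))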
#%% test the added split_types, i.e. none_with_bts and none_only
# NOTE: only the last uncommented assignment takes effect; switch as needed
spl_type = 'none_only'
spl_type = 'none_with_bts'
spl_type = 'rt'

#data_type = "actual"
data_type = "complete"
df2 = split_tts(df
                , data_type = data_type # NOTE: only 'complete' is honoured here, whatever is set
                , split_type = spl_type
                , oversampling = True
                , dst_colname = 'dst'
                , target_colname = 'dst_mode'
                , include_gene_name = True
                , random_state = 42 # default
                )

all(df2['X'].columns.isin(['gene_name'])) # should be False
import pandas as pd
from sklearn.utils import all_estimators

all_clfs = all_estimators(type_filter = "classifier")
# kept in its own variable so the feature dataframe 'df' loaded above is not overwritten
all_clfs_df = pd.DataFrame(all_clfs, columns = ['classifier_name', 'classifier_fn'])
all_clfs_df.to_csv("Model_names_ALL.csv")
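# sklearn.utils.all_estimators(type_filter="classifier") returns a list of
# (name, class) tuples for every classifier shipped with the installed scikit-learn
# version, so the CSV above is simply a catalogue of candidate model names.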
#%% TEST different CV Thresholds for split_type = NONE

Counter(df2['y'])
Counter(df2['y_bts'])

spl_type  = 'none'
data_type = "complete"
df2 = split_tts(df
                , data_type = data_type
                , split_type = spl_type
                , oversampling = True
                , dst_colname = 'dst'
                , target_colname = 'dst_mode'
                , include_gene_name = True
                , random_state = 42 # default
                )
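# With split_type = 'none', the blind-test entries of df2 are not used below; presumably
# no hold-out split is produced, which is why the *_noBTS (CV-only) variant of the model
# runner is called next. This reading is inferred from the function and parameter names.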
fooD = MultModelsCl_noBTS(input_df = df2['X']
                          , target = df2['y']
                          , skf_cv_threshold = 10 # IMP to change

                          , tts_split_type = spl_type
                          , resampling_type = 'XXXX' # default

                          , add_cm = True # adds confusion matrix based on cross_val_predict
                          , add_yn = True # adds target var class numbers

                          , var_type = ['mixed']
                          , scale_numeric = ['min_max']
                          , random_state = 42
                          , n_jobs = os.cpu_count()
                          , return_formatted_output = False
                          )

for k, v in fooD.items():
    print('\nModel:', k
          , '\nTRAIN MCC:', fooD[k]['test_mcc']
          )
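# As with MultModelsCl(), the return value appears to be a dict keyed by model name;
# with return_formatted_output = False only the raw per-model CV metrics (e.g.
# 'test_mcc') are available, as used in the loop below. Inferred from usage.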
# formatted df
foo_df3 = MultModelsCl_noBTS(input_df = df2['X']
                             , target = df2['y']
                             , skf_cv_threshold = 5 # IMP to change

                             , tts_split_type = spl_type
                             , resampling_type = 'XXXX' # default

                             , add_cm = True # adds confusion matrix based on cross_val_predict
                             , add_yn = True # adds target var class numbers

                             , var_type = ['mixed']
                             , scale_numeric = ['min_max']
                             , random_state = 42
                             , n_jobs = os.cpu_count()
                             , return_formatted_output = True
                             )
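# With return_formatted_output = True the helper appears to return a DataFrame rather
# than a dict, which is what allows foo_df3 to be row-bound with the other runs via
# pd.concat below. Again inferred from how the object is used, not from the docs.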
# NOTE: foo_df and foo_df2 are assumed to be formatted outputs from earlier runs of
# MultModelsCl_noBTS with other skf_cv_threshold values; they are not defined in this script.
dfs_combine_wf = [foo_df, foo_df2, foo_df3]

common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))

# (the original printed scoresDF_CV/scoresDF_BT shapes, which are not defined here)
print('\nCombining', len(dfs_combine_wf), 'dfs using pd.concat by row ~ rowbind'
      , '\nChecking dims of dfs to combine:'
      , '\nDims:', [d.shape for d in dfs_combine_wf])
dfs_nrows_wf = []
for df in dfs_combine_wf:
    dfs_nrows_wf = dfs_nrows_wf + [len(df)]
dfs_nrows_wf = max(dfs_nrows_wf)

dfs_ncols_wf = []
for df in dfs_combine_wf:
    dfs_ncols_wf = dfs_ncols_wf + [len(df.columns)]
dfs_ncols_wf = max(dfs_ncols_wf)
print(dfs_ncols_wf)

expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
expected_ncols_wf = dfs_ncols_wf
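# Rationale: a row-wise concat of k frames with identical columns should have
# k * nrows rows and the same number of columns, hence the expected values above
# (using max() across the frames as the reference row/column counts).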
if len(common_cols_wf) == dfs_ncols_wf:
    combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index = False)
    print('\nConcatenating dfs with different resampling methods [WF]:'
          , '\nSplit type:', spl_type
          , '\nNo. of dfs combining:', len(dfs_combine_wf))

    if len(combined_baseline_wf) == expected_nrows_wf and len(combined_baseline_wf.columns) == expected_ncols_wf:
        print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
              , '\nnrows in combined_df_wf:', len(combined_baseline_wf)
              , '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
    else:
        print('\nFAIL: concatenating failed'
              , '\nExpected nrows:', expected_nrows_wf
              , '\nGot:', len(combined_baseline_wf)
              , '\nExpected ncols:', expected_ncols_wf
              , '\nGot:', len(combined_baseline_wf.columns))
        sys.exit('\nFIRST IF FAILS')