# LSHTM_analysis/scripts/ml/ml_functions/test_func_singlegene.py

import pandas as pd
import os, sys
import numpy as np
from collections import Counter
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt
###############################################################################
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
sys.path
# import
from GetMLData import *
from SplitTTS import *
from MultClfs import *
from MultClfs_CVs import *
#%%
rs = {'random_state': 42}
skf_cv = StratifiedKFold(n_splits = 10
                         , shuffle = True
                         , **rs)
#sel_cv = logo
# sel_cv = RepeatedStratifiedKFold(n_splits = 5
# , n_repeats = 3
# , **rs)
# param dict for getmldata()
#%% READ data
gene_model_paramD = {'data_combined_model'        : False
                     , 'use_or'                    : False
                     , 'omit_all_genomic_features' : False
                     , 'write_maskfile'            : False
                     , 'write_outfile'             : False}
#df = getmldata(gene, drug, **gene_model_paramD)
#df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
df = getmldata('embB', 'ethambutol' , **gene_model_paramD)
#df = getmldata('katG', 'isoniazid' , **gene_model_paramD)
#df = getmldata('rpoB', 'rifampicin' , **gene_model_paramD)
#df = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
#df = getmldata('alr' , 'cycloserine' , **gene_model_paramD)
#%% SPLIT, Data and Resampling types
all(df.columns.isin(['gene_name'])) # should be False
spl_type = '70_30'
#spl_type = '80_20'
#spl_type = 'sl'
#data_type = "actual"
data_type = "complete"
df2 = split_tts(df
                , data_type = data_type
                , split_type = spl_type
                , oversampling = True
                , dst_colname = 'dst'
                , target_colname = 'dst_mode'
                , include_gene_name = True
                , random_state = 42 # default
                )
all(df2['X'].columns.isin(['gene_name'])) # should be False
df['dst'].value_counts()
df['dst'].isna().sum()
df['dst_mode'].value_counts()
len(df)
Counter(df2['y'])
Counter(df2['y_bts'])
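# A minimal sketch (assuming df2['y'] and df2['y_bts'] are the binary targets
# returned by split_tts above): summarise the class balance of the CV/train
# target against the blind-test target in one place.
for label, target in [('y (train/CV)', df2['y']), ('y_bts (blind test)', df2['y_bts'])]:
    counts = Counter(target)
    print(label, dict(counts)
          , 'minority fraction:', round(min(counts.values()) / sum(counts.values()), 3))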
#%% Run Multiple models
fooD = MultModelsCl(input_df = df2['X']
                    , target = df2['y']
                    , sel_cv = skf_cv
                    , run_blind_test = True
                    , blind_test_df = df2['X_bts']
                    , blind_test_target = df2['y_bts']
                    , tts_split_type = spl_type
                    , resampling_type = 'XXXX' # default
                    , var_type = ['mixed']
                    , scale_numeric = ['min_max']
                    , return_formatted_output = False
                    )
for k, v in fooD.items():
    print('\nModel:', k
          , '\nTRAIN MCC:', fooD[k]['test_mcc']
          , '\nBTS MCC:'  , fooD[k]['bts_mcc']
          , '\nDIFF:'     , fooD[k]['bts_mcc'] - fooD[k]['test_mcc'])

for k, v in fooD.items():
    print('\nModel:', k
          , '\nTRAIN ACCURACY:', fooD[k]['test_accuracy']
          , '\nBTS ACCURACY:'  , fooD[k]['bts_accuracy']
          , '\nDIFF:'          , fooD[k]['bts_accuracy'] - fooD[k]['test_accuracy'])
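# A minimal sketch: collect the per-model CV vs blind-test metrics printed
# above into one tidy dataframe for side-by-side comparison (assumes each
# fooD[k] dict carries the 'test_*' and 'bts_*' keys used in the loops above).
summary_rows = []
for k in fooD:
    summary_rows.append({'model'    : k
                         , 'cv_mcc'  : fooD[k]['test_mcc']
                         , 'bts_mcc' : fooD[k]['bts_mcc']
                         , 'cv_acc'  : fooD[k]['test_accuracy']
                         , 'bts_acc' : fooD[k]['bts_accuracy']})
summary_df = pd.DataFrame(summary_rows)
print(summary_df.sort_values('bts_mcc', ascending = False))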
#%% CHECK SCALING
embb_df = getmldata('embB', 'ethambutol', **gene_model_paramD)
all(embb_df.columns.isin(['gene_name'])) # should be False
scaler = MinMaxScaler(feature_range=(-1, 1))
bar = embb_df[['vdwclashes_rr', 'electro_rr']]
bar_df1 = scaler.fit_transform(bar)
bar_df1 = pd.DataFrame(bar_df1)
bar_df1.rename(columns = {0:'vdw_scaled', 1: 'ele_scaled'}, inplace = True)
bar2 = pd.concat([bar, bar_df1], axis = 1)
scaler2 = StandardScaler()
baz = embb_df[['vdwclashes_rr', 'electro_rr']]
baz_df1 = scaler2.fit_transform(baz)
baz_df1 = pd.DataFrame(baz_df1)
baz_df1.rename(columns = {0:'vdw_scaled', 1: 'ele_scaled'}, inplace = True)
baz2 = pd.concat([baz, baz_df1], axis = 1)
a = pd.concat([bar2, baz2], axis = 1)
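# A minimal sketch: confirm the two scalers behaved as expected on the chosen
# columns (MinMaxScaler output bounded within (-1, 1); StandardScaler output
# roughly zero mean and unit variance).
print('MinMax-scaled ranges:\n', bar_df1.agg(['min', 'max']))
print('Standard-scaled mean/std:\n', baz_df1.agg(['mean', 'std']))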
#%% test added split_types i.e. none_with_bts and none_only
# pick ONE of the following; when run top-to-bottom only the last assignment takes effect
spl_type = 'none_only'
spl_type = 'none_with_bts'
spl_type = 'rt'
#data_type = "actual"
data_type = "complete"
df2 = split_tts(df
                , data_type = data_type # only 'complete' is honoured here, regardless of what is set
                , split_type = spl_type
                , oversampling = True
                , dst_colname = 'dst'
                , target_colname = 'dst_mode'
                , include_gene_name = True
                , random_state = 42 # default
                )
all(df2['X'].columns.isin(['gene_name'])) # should be False
from sklearn.utils import all_estimators

# list all available sklearn classifiers; kept in a separate dataframe so the
# gene data held in `df` above is not overwritten
all_clfs = all_estimators(type_filter = "classifier")
all_clfs_df = pd.DataFrame(all_clfs, columns = ['classifier_name', 'classifier_fn'])
all_clfs_df.to_csv("Model_names_ALL.csv")
################################################################
#%% TEST different CV Thresholds for split_type = NONE
################################################################
Counter(df2['y'])
Counter(df2['y_bts'])
# READ Data
spl_type = 'none'
data_type = "complete"
df2 = split_tts(df
                , data_type = data_type
                , split_type = spl_type
                , oversampling = True
                , dst_colname = 'dst'
                , target_colname = 'dst_mode'
                , include_gene_name = True
                , random_state = 42 # default
                )
#%% Trying different CV thresholds for resampling 'none' ONLY
fooD = MultModelsCl_CVs(input_df = df2['X']
                        , target = df2['y']
                        , skf_cv_threshold = 10 # IMP to change
                        , tts_split_type = spl_type
                        , resampling_type = 'NONE' # default
                        , add_cm = True # adds confusion matrix based on cross_val_predict
                        , add_yn = True # adds target var class numbers
                        , var_type = ['mixed']
                        , scale_numeric = ['min_max']
                        , random_state = 42
                        , n_jobs = os.cpu_count()
                        , return_formatted_output = False
                        )
for k, v in fooD.items():
    print('\nModel:', k
          , '\nTRAIN MCC:', fooD[k]['test_mcc'])
# formatted df
foo_df3 = MultModelsCl_CVs(input_df = df2['X']
                           , target = df2['y']
                           , skf_cv_threshold = 5 # IMP to change
                           , tts_split_type = spl_type
                           , resampling_type = 'XXXX' # default
                           , add_cm = True # adds confusion matrix based on cross_val_predict
                           , add_yn = True # adds target var class numbers
                           , var_type = ['mixed']
                           , scale_numeric = ['min_max']
                           , random_state = 42
                           , n_jobs = os.cpu_count()
                           , return_formatted_output = True
                           )
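# foo_df and foo_df2 (used below) are assumed to be analogous formatted runs at
# other CV thresholds; a minimal sketch, with thresholds 3 and 10 as
# illustrative choices:
foo_df = MultModelsCl_CVs(input_df = df2['X']
                          , target = df2['y']
                          , skf_cv_threshold = 3
                          , tts_split_type = spl_type
                          , resampling_type = 'XXXX' # default
                          , add_cm = True
                          , add_yn = True
                          , var_type = ['mixed']
                          , scale_numeric = ['min_max']
                          , random_state = 42
                          , n_jobs = os.cpu_count()
                          , return_formatted_output = True
                          )

foo_df2 = MultModelsCl_CVs(input_df = df2['X']
                           , target = df2['y']
                           , skf_cv_threshold = 10
                           , tts_split_type = spl_type
                           , resampling_type = 'XXXX' # default
                           , add_cm = True
                           , add_yn = True
                           , var_type = ['mixed']
                           , scale_numeric = ['min_max']
                           , random_state = 42
                           , n_jobs = os.cpu_count()
                           , return_formatted_output = True
                           )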
dfs_combine_wf = [foo_df, foo_df2, foo_df3]
common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))
print('\nCombining', len(dfs_combine_wf), 'dfs using pd.concat by row ~ rowbind'
      , '\nChecking dims of dfs to combine:'
      , '\nShapes:', [df.shape for df in dfs_combine_wf])
#print(scoresDF_CV)
#print(scoresDF_BT)
dfs_nrows_wf = []
for df in dfs_combine_wf:
    dfs_nrows_wf = dfs_nrows_wf + [len(df)]
dfs_nrows_wf = max(dfs_nrows_wf)

dfs_ncols_wf = []
for df in dfs_combine_wf:
    dfs_ncols_wf = dfs_ncols_wf + [len(df.columns)]
dfs_ncols_wf = max(dfs_ncols_wf)
print(dfs_ncols_wf)

expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
expected_ncols_wf = dfs_ncols_wf
if len(common_cols_wf) == dfs_ncols_wf:
    combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index = False)
    print('\nConcatenating dfs with different resampling methods [WF]:'
          , '\nSplit type:', spl_type
          , '\nNo. of dfs combining:', len(dfs_combine_wf))
    if len(combined_baseline_wf) == expected_nrows_wf and len(combined_baseline_wf.columns) == expected_ncols_wf:
        print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
              , '\nnrows in combined_df_wf:', len(combined_baseline_wf)
              , '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
    else:
        print('\nFAIL: concatenating failed'
              , '\nExpected nrows:', expected_nrows_wf
              , '\nGot:', len(combined_baseline_wf)
              , '\nExpected ncols:', expected_ncols_wf
              , '\nGot:', len(combined_baseline_wf.columns))
        sys.exit('\nFIRST IF FAILS')
#%% TRY with dict containing different Resampling types
paramD = {
    'baseline_paramD' : {'input_df'          : df2['X']
                         , 'target'           : df2['y']
                         , 'var_type'         : 'mixed'
                         , 'resampling_type'  : 'none'}

    , 'smnc_paramD'   : {'input_df'          : df2['X_smnc']
                         , 'target'           : df2['y_smnc']
                         , 'var_type'         : 'mixed'
                         , 'resampling_type'  : 'smnc'}
    }
mmDD = {}
for k, v in paramD.items():
    print(k)
    all_scoresDF = pd.DataFrame()
    for skf_cv_threshold in [3, 5]:
        print('\nRunning CV threshold:', skf_cv_threshold)
        current_scoreDF = MultModelsCl_CVs(**paramD[k]
                                           , skf_cv_threshold = skf_cv_threshold # IMP to change
                                           , tts_split_type = spl_type
                                           #, resampling_type = 'XXXX' # default
                                           , add_cm = True # adds confusion matrix based on cross_val_predict
                                           , add_yn = True # adds target var class numbers
                                           #, var_type = ['mixed']
                                           , scale_numeric = ['min_max']
                                           , random_state = 42
                                           , n_jobs = os.cpu_count()
                                           , return_formatted_output = True
                                           )
        all_scoresDF = pd.concat([all_scoresDF, current_scoreDF])
    mmDD[k] = all_scoresDF
for k, v in mmDD.items():
    print(k, v)
out_wf  = pd.concat(mmDD, ignore_index = True)
out_wf2 = pd.concat(mmDD)
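# A minimal sketch: pd.concat(mmDD) keeps the paramD keys ('baseline_paramD',
# 'smnc_paramD') as the outer index level of out_wf2, so the resampling setting
# stays traceable when the combined scores are written out
# (the output filename below is illustrative).
out_wf2.to_csv("CV_threshold_scores_WF.csv", index_label = ['resampling_run', 'row'])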