From 8d831f3613fc73b98d59d52e28fa5631df55ad1c Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 5 Jul 2022 22:47:13 +0100
Subject: [PATCH] Add selectable numeric scaling options (scale_numeric) to MultModelsCl

---
 scripts/ml/ml_functions/MultClfs.py           | 74 +++++++++++++------
 scripts/ml/ml_functions/ml_data_combined.py   |  2 +-
 .../ml/ml_functions/test_func_singlegene.py   | 54 +++++++++++---
 3 files changed, 99 insertions(+), 31 deletions(-)

diff --git a/scripts/ml/ml_functions/MultClfs.py b/scripts/ml/ml_functions/MultClfs.py
index 688caf3..290c06a 100755
--- a/scripts/ml/ml_functions/MultClfs.py
+++ b/scripts/ml/ml_functions/MultClfs.py
@@ -142,7 +142,9 @@ scoreBT_mapD = {'bts_mcc'          : 'MCC'
 # Run Multiple Classifiers
 ############################
 # Multiple Classification - Model Pipeline
-def MultModelsCl(input_df, target, skf_cv
+def MultModelsCl(input_df, target
+                       #, skf_cv
+                       , sel_cv
                        , blind_test_df
                        , blind_test_target
                        , tts_split_type 
@@ -150,7 +152,8 @@ def MultModelsCl(input_df, target, skf_cv
                        , resampling_type = 'none' # default
                        , add_cm = True # adds confusion matrix based on cross_val_predict
                        , add_yn = True  # adds target var class numbers
-                       , var_type = ['numerical', 'categorical','mixed']
+                       , var_type = ['numerical', 'categorical','mixed'] 
+                       , scale_numeric = ['min_max', 'std', 'min_max_neg', 'none'] 
                        , run_blind_test = True
                        , return_formatted_output = True):
 
@@ -182,24 +185,52 @@ def MultModelsCl(input_df, target, skf_cv
     #======================================================
     # Determine preprocessing steps ~ var_type
     #======================================================
-    if var_type == 'numerical':
-        t = [('num', MinMaxScaler(), numerical_ix)]
+   
+    # if var_type == 'numerical':
+    #     t = [('num', MinMaxScaler(), numerical_ix)]
 
+    # if var_type == 'categorical':
+    #     t = [('cat', OneHotEncoder(), categorical_ix)]
+    
+    # # if var_type == 'mixed':
+    # #     t = [('num', MinMaxScaler(), numerical_ix)
+    # #         , ('cat', OneHotEncoder(), categorical_ix) ]
+
+    # if var_type == 'mixed':
+    #     t = [('cat', OneHotEncoder(), categorical_ix) ]
+    if type(var_type) == list: 
+        var_type = str(var_type[0])
+    else:
+        var_type = var_type
+    
+    if var_type in ['numerical','mixed']:
+        if scale_numeric == ['none']:
+            t = [('cat', OneHotEncoder(), categorical_ix)]
+        if scale_numeric != ['none']:
+            if scale_numeric == ['min_max']:
+                scaler = MinMaxScaler()
+            if scale_numeric == ['min_max_neg']:
+                scaler = MinMaxScaler(feature_range=(-1, 1))
+            if scale_numeric == ['std']:
+                scaler = StandardScaler()
+            
+            t = [('num', scaler, numerical_ix)
+              , ('cat', OneHotEncoder(), categorical_ix)]
+        
+                
     if var_type == 'categorical':
         t = [('cat', OneHotEncoder(), categorical_ix)]
-    
-    if var_type == 'mixed':
-        t = [('num', MinMaxScaler(), numerical_ix)
-            , ('cat', OneHotEncoder(), categorical_ix) ]
+
         
     col_transform = ColumnTransformer(transformers = t
                                        , remainder='passthrough')
     
+    
     #======================================================
     # Specify multiple Classification Models  
     #======================================================
     models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
-               , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
+               , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
                , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
                , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
                , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
@@ -211,18 +242,18 @@ def MultModelsCl(input_df, target, skf_cv
                , ('Logistic Regression'       , LogisticRegression(**rs) )
                , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
                , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
-               , ('Multinomial'               , MultinomialNB() )
+               #, ('Multinomial'               , MultinomialNB() )
                , ('Naive Bayes'               , BernoulliNB() )
                , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
                , ('QDA'                       , QuadraticDiscriminantAnalysis() )
                , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
-               , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
-                                                                       , n_estimators     = 1000
-                                                                       , bootstrap        = True
-                                                                       , oob_score        = True
-                                                                       , **njobs
-                                                                       , **rs
-                                                                       , max_features     = 'auto') ) 
+               # , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
+               #                                                         , n_estimators     = 1000
+               #                                                         , bootstrap        = True
+               #                                                         , oob_score        = True
+               #                                                         , **njobs
+               #                                                         , **rs
+               #                                                         , max_features     = 'auto') ) 
                 , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
                 , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
                 , ('SVC'                       , SVC(**rs) ) 
@@ -254,7 +285,7 @@ def MultModelsCl(input_df, target, skf_cv
         skf_cv_modD = cross_validate(model_pipeline
                               , input_df
                               , target
-                              , cv = skf_cv
+                              , cv = sel_cv
                               , scoring = scoring_fn
                               , return_train_score = True)
         #==============================
@@ -283,7 +314,7 @@ def MultModelsCl(input_df, target, skf_cv
            cmD = {}
 
             # Calculate cm         
-           y_pred   = cross_val_predict(model_pipeline, input_df, target, cv = skf_cv, **njobs)
+           y_pred   = cross_val_predict(model_pipeline, input_df, target, cv = sel_cv, **njobs)
             #_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally
            tn, fp, fn, tp = confusion_matrix(y_pred, target).ravel()
     
@@ -334,8 +365,9 @@ def MultModelsCl(input_df, target, skf_cv
            
            bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
            print('\nMCC on Blind test:'     , bts_mcc_score)
-           print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
-           
+           #print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
+           print('\nMCC on Training:'      , mm_skf_scoresD[model_name]['test_mcc'] )
+
            mm_skf_scoresD[model_name]['bts_mcc']       = bts_mcc_score
            mm_skf_scoresD[model_name]['bts_fscore']    = round(f1_score(blind_test_target, bts_predict),2)
            mm_skf_scoresD[model_name]['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2)
diff --git a/scripts/ml/ml_functions/ml_data_combined.py b/scripts/ml/ml_functions/ml_data_combined.py
index 57e2295..7dca351 100644
--- a/scripts/ml/ml_functions/ml_data_combined.py
+++ b/scripts/ml/ml_functions/ml_data_combined.py
@@ -26,7 +26,7 @@ from GetMLData import *
 combined_model_paramD = {'data_combined_model'   : True
                     , 'use_or'                   : False
                     , 'omit_all_genomic_features': False
-                    , 'write_maskfile'           : False
+                    , 'write_maskfile'           : False # set to True once to write the maskfile and check it
                     , 'write_outfile'            : False }
 
 pnca_df = getmldata('pncA', 'pyrazinamide' , **combined_model_paramD)
diff --git a/scripts/ml/ml_functions/test_func_singlegene.py b/scripts/ml/ml_functions/test_func_singlegene.py
index d483514..26a0095 100644
--- a/scripts/ml/ml_functions/test_func_singlegene.py
+++ b/scripts/ml/ml_functions/test_func_singlegene.py
@@ -14,7 +14,8 @@ sys.path
 # import
 from GetMLData import *
 from SplitTTS import *
-from MultClfs_fi import *
+#from MultClfs_fi import *
+from MultClfs import *
 
 #%%
 # X,y = load_boston(return_X_y=True) 
@@ -33,7 +34,7 @@ from MultClfs_fi import *
 
 #%%
 
-sel_cv = StratifiedKFold(n_splits = 10
+skf_cv = StratifiedKFold(n_splits = 10
                             , shuffle = True,**rs)
 #sel_cv = logo
 # sel_cv = RepeatedStratifiedKFold(n_splits = 5
@@ -48,10 +49,21 @@ gene_model_paramD = {'data_combined_model'   : False
 
 #df = getmldata(gene, drug, **gene_model_paramD)
 df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
+df = getmldata('embB', 'ethambutol'   , **gene_model_paramD)
+df = getmldata('katG', 'isoniazid'    , **gene_model_paramD)
+df = getmldata('rpoB', 'rifampicin'   , **gene_model_paramD)
+df  = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
+#df  = getmldata('alr' , 'cycloserine'  , **combined_model_paramD)
+all(df.columns.isin(['gene_name'])) # should be False
+
+
+spl_type = '70_30'
+spl_type = '80_20'
+spl_type = 'sl'
 
 df2 = split_tts(df
           , data_type = 'actual'
-          , split_type = '70_30'
+          , split_type = spl_type
           , oversampling = False
           , dst_colname = 'dst'
           , target_colname = 'dst_mode'
@@ -61,19 +73,43 @@ df2 = split_tts(df
 
 all(df2['X'].columns.isin(['gene_name'])) # should be False
 
-fooD = MultClfs_fi (input_df = df2['X']
+fooD = MultModelsCl(input_df = df2['X']
                 , target = df2['y']
-                , sel_cv = sel_cv
+                , sel_cv = skf_cv
                 , run_blind_test = True
                 , blind_test_df =  df2['X_bts']
                 , blind_test_target =  df2['y_bts']
-                , tts_split_type  = '70_30'
-                , var_type = 'mixed'
+                , tts_split_type  = spl_type
                 , resampling_type = 'none' # default
-)
+                , var_type = ['mixed']
+                , scale_numeric = ['min_max_neg']
+                , return_formatted_output = False
+
+                )
 
 for k, v in fooD.items():
     print('\nModel:', k
           , '\nTRAIN MCC:', fooD[k]['test_mcc']
           ,  '\nBTS MCC:' , fooD[k]['bts_mcc']
-          , '\nDIFF:',fooD[k]['bts_mcc'] - fooD[k]['test_mcc'] )
\ No newline at end of file
+          , '\nDIFF:',fooD[k]['bts_mcc'] - fooD[k]['test_mcc'] )
+    
+#%% CHECK SCALING
+embb_df = getmldata('embB', 'ethambutol'   , **combined_model_paramD)
+all(embb_df.columns.isin(['gene_name'])) # should be False
+
+scaler = MinMaxScaler(feature_range=(-1, 1))
+bar = embb_df[['vdwclashes_rr', 'electro_rr']]
+bar_df1 = scaler.fit_transform(bar)
+bar_df1 = pd.DataFrame(bar_df1)
+bar_df1.rename(columns = {0:'vdw_scaled', 1: 'ele_scaled'}, inplace = True)
+bar2 = pd.concat([bar, bar_df1], axis = 1)
+
+
+scaler2 = StandardScaler()
+baz = embb_df[['vdwclashes_rr', 'electro_rr']]
+baz_df1 = scaler2.fit_transform(baz)
+baz_df1 = pd.DataFrame(baz_df1)
+baz_df1.rename(columns = {0:'vdw_scaled', 1: 'ele_scaled'}, inplace = True)
+baz2 = pd.concat([baz, baz_df1], axis = 1)
+
+a = pd.concat([bar2, baz2], axis = 1)