added ml_functions dir

2022-06-29 12:06:47 +01:00 · 2022-06-29 12:06:47 +01:00 · 9aadb0329f
commit 9aadb0329f
parent c85c965c3e
30 changed files with 683 additions and 606160 deletions
--- a/scripts/ml/ml_functions/SplitTTS.py
+++ b/scripts/ml/ml_functions/SplitTTS.py
@@ -0,0 +1,287 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Jun 25 11:07:30 2022
+
+@author: tanu
+"""
+
+import sys, os
+import pandas as pd
+import numpy as np
+import os, sys
+import pandas as pd
+import numpy as np
+print(np.__version__)
+print(pd.__version__)
+import pprint as pp
+from copy import deepcopy
+from collections import Counter
+from sklearn.impute import KNNImputer as KNN
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.under_sampling import RandomUnderSampler
+from imblearn.over_sampling import SMOTE
+from sklearn.datasets import make_classification
+from imblearn.combine import SMOTEENN
+from imblearn.combine import SMOTETomek
+
+from imblearn.over_sampling import SMOTENC
+from imblearn.under_sampling import EditedNearestNeighbours
+from imblearn.under_sampling import RepeatedEditedNearestNeighbours
+
+from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
+
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
+from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
+
+from sklearn.pipeline import Pipeline, make_pipeline
+import argparse
+import re
+# Home directory of the current user ('~' expanded by os.path.expanduser).
+homedir = os.path.expanduser("~")
+#%% GLOBALS
+# Keyword-argument dicts meant to be unpacked into calls (``**rs``,
+# ``**njobs``) so the random seed and the number of parallel jobs are
+# defined in a single place.
+rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
+
+#%% Define split_tts function #################################################
+def split_tts(ml_input_data
+              , data_type      = ['actual', 'complete']
+              , split_type     = ['70_30', '80_20', 'sl']
+              , oversampling   = True
+              , dst_colname    = 'dst'# determine how to subset the actual vs reverse data
+              , target_colname = 'dst_mode'
+              , include_gene_name = True
+              , k_smote = 5):
+    
+    outDict = {}
+    
+    print('\nInput params:' 
+          , '\nDim of input df:'   , ml_input_data.shape
+          , '\nData type to split:', data_type
+          , '\nSplit type:'        , split_type
+          , '\ntarget colname:'    , target_colname)
+        
+    if oversampling:
+        print('\noversampling enabled')
+    else:
+        print('\nNot generating oversampled or undersampled data')
+    
+    if include_gene_name:
+        cols_to_dropL = []
+    else:
+        cols_to_dropL = ['gene_name']
+
+    #====================================
+    # evaluating use_data_type
+    #====================================
+    if data_type == 'actual':
+        ml_data = ml_input_data[ml_input_data[dst_colname].notna()]
+    if data_type == 'complete':
+        ml_data = ml_input_data.copy()
+        
+    #====================================
+    # separate features and target
+    #====================================
+    cols_to_dropL = cols_to_dropL + [target_colname, dst_colname]
+    x_features    = ml_data.drop(cols_to_dropL, axis = 1)
+    y_target      = ml_data[target_colname]
+        
+    # sanity check
+    check1 = x_features[[i for i in cols_to_dropL if i in x_features.columns]]
+    
+    #if not 'dst_mode' in x_features.columns:
+    if check1.empty:
+        print('\nPASS: x_features has no target variable and no dst column'
+              , '\nDropped cols:', len(cols_to_dropL)
+              , '\nThese were:', target_colname,'and', dst_colname)
+        x_ncols = len(x_features.columns)
+        print('\nNo. of cols in input df:', len(ml_input_data.columns)
+              , '\nNo.of cols dropped:', len(cols_to_dropL)
+              , '\nNo. of columns for x_features:', x_ncols)
+    else:
+        sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')
+        
+    #====================================
+    # Train test split
+    # with stratification
+    #=====================================
+    if split_type == '70_30':
+        tts_test_size = 0.33
+    if split_type == '80_20':
+        tts_test_size = 0.2
+    if split_type == 'sl':
+        tts_test_size = 1/np.sqrt(x_ncols)
+        train_sl = 1 - tts_test_size
+    
+    #-------------------------
+    #  TTS split ~ split_type
+    #-------------------------
+    #x_train, x_test, y_train, y_test # traditional var_names
+    # so my downstream code doesn't need to change    
+    X, X_bts, y, y_bts = train_test_split(x_features, y_target
+                                                    , test_size = tts_test_size
+                                                    , **rs
+                                                    , stratify = y_target)
+    yc1 = Counter(y)
+    yc1_ratio = yc1[0]/yc1[1]
+    
+    yc2 = Counter(y_bts)
+    yc2_ratio = yc2[0]/yc2[1]
+    ###############################################################################
+    #======================================================
+    # Determine categorical and numerical features
+    #======================================================
+    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
+    numerical_cols 
+    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
+    categorical_cols 
+    ###############################################################################
+    print('\n-------------------------------------------------------------'
+          , '\nSuccessfully generated training and test data:'
+          , '\nData used:' , data_type
+          , '\nSplit type:', split_type
+
+          , '\n\nTotal no. of input features:'      , len(X.columns)
+          , '\n--------No. of numerical features:'  , len(numerical_cols)
+          , '\n--------No. of categorical features:', len(categorical_cols)
+          
+          , '\n==========================='
+          , '\n Resampling: NONE'
+          , '\nBaseline'
+          , '\n==========================='
+          
+          , '\n\nTotal data size:', len(X) + len(X_bts)
+    
+          , '\n\nTrain data size:', X.shape
+          , '\ny_train numbers:', yc1
+    
+          , '\n\nTest data size:', X_bts.shape
+          , '\ny_test_numbers:', yc2
+    
+          , '\n\ny_train ratio:',yc1_ratio
+          , '\ny_test ratio:', yc2_ratio
+          , '\n-------------------------------------------------------------')
+    
+    outDict.update({'X'       : X
+            , 'X_bts' : X_bts
+            , 'y'     : y
+            , 'y_bts' : y_bts
+            } ) 
+    
+    if oversampling:
+        #######################################################################
+        #                               RESAMPLING
+        #######################################################################
+        #------------------------------
+        # Simple Random oversampling
+        # [Numerical + catgeorical]
+        #------------------------------
+        oversample = RandomOverSampler(sampling_strategy='minority')
+        X_ros, y_ros = oversample.fit_resample(X, y)
+        print('\nSimple Random OverSampling\n', Counter(y_ros))
+        print(X_ros.shape)
+        
+        #------------------------------
+        # Simple Random Undersampling
+        # [Numerical + catgeorical]
+        #------------------------------
+        undersample = RandomUnderSampler(sampling_strategy='majority')
+        X_rus, y_rus = undersample.fit_resample(X, y)
+        print('\nSimple Random UnderSampling\n', Counter(y_rus))
+        print(X_rus.shape)
+        
+        #------------------------------
+        # Simple combine ROS and RUS
+        # [Numerical + catgeorical]
+        #------------------------------
+        oversample = RandomOverSampler(sampling_strategy='minority')
+        X_ros, y_ros = oversample.fit_resample(X, y)
+        undersample = RandomUnderSampler(sampling_strategy='majority')
+        X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
+        print('\nSimple Combined Over and UnderSampling\n',  Counter(y_rouC))
+        print(X_rouC.shape)
+        
+        #------------------------------
+        # SMOTE_NC: oversampling 
+        # [numerical + categorical]
+        #https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
+        #------------------------------
+        # Determine categorical and numerical features
+        numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
+        numerical_ix
+        num_featuresL = list(numerical_ix)
+        numerical_colind = X.columns.get_indexer(list(numerical_ix) )
+        numerical_colind
+        
+        categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
+        categorical_ix    
+        categorical_colind = X.columns.get_indexer(list(categorical_ix))
+        categorical_colind
+        
+        #k_sm = 5 # default
+        k_sm = k_smote
+        sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
+        X_smnc, y_smnc = sm_nc.fit_resample(X, y)
+        print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
+        print(X_smnc.shape)
+        
+        print('\nGenerated Resampled data as below:'
+            , '\n================================='
+            , '\nResampling: Random oversampling'
+            , '\n================================'
+             
+            , '\n\nTrain data size:', X_ros.shape
+            , '\ny_train numbers:', len(y_ros)
+            , '\n\ny_train ratio:', Counter(y_ros)[0]/Counter(y_ros)[1]
+            
+            , '\ny_test ratio:' , yc2_ratio
+            ##################################################################
+            , '\n================================'
+            , '\nResampling: Random underampling'
+            , '\n================================'
+            
+            , '\n\nTrain data size:', X_rus.shape
+            , '\ny_train numbers:', len(y_rus)
+            , '\n\ny_train ratio:', Counter(y_rus)[0]/Counter(y_rus)[1]
+            
+            , '\ny_test ratio:' , yc2_ratio
+            ##################################################################
+            , '\n================================'
+            , '\nResampling:Combined (over+under)'
+            , '\n================================'
+                                    
+            , '\n\nTrain data size:', X_rouC.shape
+            , '\ny_train numbers:', len(y_rouC)
+            , '\n\ny_train ratio:', Counter(y_rouC)[0]/Counter(y_rouC)[1]
+            
+            , '\ny_test ratio:' , yc2_ratio
+            ##################################################################            
+            , '\n=============================='
+            , '\nResampling: Smote NC'
+            , '\n=============================='
+            
+            , '\n\nTrain data size:', X_smnc.shape
+            , '\ny_train numbers:', len(y_smnc)
+            , '\n\ny_train ratio:', Counter(y_smnc)[0]/Counter(y_smnc)[1]
+            
+            , '\ny_test ratio:' , yc2_ratio
+            ##################################################################
+           , '\n-------------------------------------------------------------')
+
+        outDict.update({'X_ros'   : X_ros
+                        , 'y_ros' : y_ros
+                            
+                        , 'X_rus' : X_rus
+                        , 'y_rus' : y_rus
+                            
+                        , 'X_rouC': X_rouC
+                        , 'y_rouC': y_rouC
+                            
+                        , 'X_smnc': X_smnc
+                        , 'y_smnc': y_smnc})
+        return(outDict)
+        
+     #   globals().update(locals()) # TROLOLOLOLOLOLS
+     
+    else:
+        return(outDict)