horrible lineage analysis hell

2022-06-28 21:51:02 +01:00 · 2022-06-28 21:51:02 +01:00 · 478df927cc
commit 478df927cc
parent ce0f12382e
10 changed files with 1669 additions and 101 deletions
--- a/scripts/ml/combined_model/untitled0.py
+++ b/scripts/ml/combined_model/untitled0.py
@ -0,0 +1,221 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Jun 25 11:07:30 2022
+
+@author: tanu
+"""
+
+import sys, os
+import pandas as pd
+import numpy as np
+import os, sys
+import pandas as pd
+import numpy as np
+print(np.__version__)
+print(pd.__version__)
+import pprint as pp
+from copy import deepcopy
+from collections import Counter
+from sklearn.impute import KNNImputer as KNN
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.under_sampling import RandomUnderSampler
+from imblearn.over_sampling import SMOTE
+from sklearn.datasets import make_classification
+from imblearn.combine import SMOTEENN
+from imblearn.combine import SMOTETomek
+
+from imblearn.over_sampling import SMOTENC
+from imblearn.under_sampling import EditedNearestNeighbours
+from imblearn.under_sampling import RepeatedEditedNearestNeighbours
+
+from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
+
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
+from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
+
+from sklearn.pipeline import Pipeline, make_pipeline
+import argparse
+import re
+homedir = os.path.expanduser("~")  # current user's home directory; not referenced elsewhere in the visible code
+#%% Globals
+rs = {'random_state': 42}  # fixed-seed kwargs, unpacked as **rs into split/resampler calls in split_tts
+njobs = {'n_jobs': 10}  # parallelism kwargs, unpacked as **njobs into the SMOTENC constructor in split_tts
+#%% Define split_tts function #################################################
+def split_tts(ml_input_data
+              , data_type      = ['actual', 'complete', 'reverse']
+              , split_type     = ['70_30', '80_20', 'sl']
+              , oversampling   = True
+              , dst_colname    = 'dst'# determine how to subset the actual vs reverse data
+              , target_colname = 'dst_mode'):
+    
+    print('\nInput params:' 
+          , '\nDim of input df:'   , ml_input_data.shape
+          , '\nData type to split:', data_type
+          , '\nSplit type:'        , split_type
+          , '\ntarget colname:'    , target_colname)
+        
+    if oversampling:
+        print('\noversampling enabled')
+    else:
+        print('\nNot generating oversampled or undersampled data')
+
+    #====================================
+    # evaluating use_data_type
+    #====================================
+    if data_type == 'actual':
+        ml_data = ml_input_data[ml_input_data[dst_colname].notna()]
+    if data_type == 'complete':
+        ml_data = ml_input_data.copy()
+    if data_type == 'reverse':
+        ml_data = ml_input_data[ml_input_data[dst_colname].isna()]
+    #if_data_type == none
+        
+    #====================================
+    # separate features and target
+    #====================================
+    x_features = ml_data.drop([target_colname, dst_colname], axis = 1)
+    y_target   = ml_data[target_colname]
+        
+    # sanity check
+    if not 'dst_mode' in x_features.columns:
+        print('\nPASS: x_features has no target variable')
+        x_ncols = len(x_features.columns)
+        print('\nNo. of columns for x_features:', x_ncols)
+    else:
+        sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')
+        
+    #====================================
+    # Train test split
+    # with stratification
+    #=====================================
+    if split_type == '70_30':
+        tts_test_size = 0.33
+    if split_type == '80_20':
+        tts_test_size = 0.2
+    if split_type == 'sl':
+        tts_test_size = 1/np.sqrt(x_ncols)
+        train_sl = 1 - tts_test_size
+    
+    #-------------------------
+    #  TTS split ~ split_type
+    #-------------------------
+    #x_train, x_test, y_train, y_test # traditional var_names
+    # so my downstream code doesn't need to change    
+    X, X_bts, y, y_bts = train_test_split(x_features, y_target
+                                                    , test_size = tts_test_size
+                                                    , **rs
+                                                    , stratify = y_target)
+    yc1 = Counter(y)
+    yc1_ratio = yc1[0]/yc1[1]
+    
+    yc2 = Counter(y_bts)
+    yc2_ratio = yc2[0]/yc2[1]
+    ###############################################################################
+    #======================================================
+    # Determine categorical and numerical features
+    #======================================================
+    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
+    numerical_cols 
+    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
+    categorical_cols 
+    ###############################################################################
+    print('\n-------------------------------------------------------------'
+          , '\nSuccessfully generated training and test data:'
+          , '\nData used:' , data_type
+          , '\nSplit type:', split_type
+
+          , '\n\nTotal no. of input features:'      , len(X.columns)
+          , '\n--------No. of numerical features:'  , len(numerical_cols)
+          , '\n--------No. of categorical features:', len(categorical_cols)
+      
+          , '\n\nTotal data size:', len(X) + len(X_bts)
+    
+          , '\n\nTrain data size:', X.shape
+          , '\ny_train numbers:', yc1
+    
+          , '\n\nTest data size:', X_bts.shape
+          , '\ny_test_numbers:', yc2
+    
+          , '\n\ny_train ratio:',yc1_ratio
+          , '\ny_test ratio:', yc2_ratio
+          , '\n-------------------------------------------------------------'
+      )
+    
+    if oversampling:
+        
+        #######################################################################
+        #                               RESAMPLING
+        #######################################################################
+        #------------------------------
+        # Simple Random oversampling
+        # [Numerical + catgeorical]
+        #------------------------------
+        oversample = RandomOverSampler(sampling_strategy='minority')
+        X_ros, y_ros = oversample.fit_resample(X, y)
+        print('\nSimple Random OverSampling\n', Counter(y_ros))
+        print(X_ros.shape)
+        
+        #------------------------------
+        # Simple Random Undersampling
+        # [Numerical + catgeorical]
+        #------------------------------
+        undersample = RandomUnderSampler(sampling_strategy='majority')
+        X_rus, y_rus = undersample.fit_resample(X, y)
+        print('\nSimple Random UnderSampling\n', Counter(y_rus))
+        print(X_rus.shape)
+        
+        #------------------------------
+        # Simple combine ROS and RUS
+        # [Numerical + catgeorical]
+        #------------------------------
+        oversample = RandomOverSampler(sampling_strategy='minority')
+        X_ros, y_ros = oversample.fit_resample(X, y)
+        undersample = RandomUnderSampler(sampling_strategy='majority')
+        X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
+        print('\nSimple Combined Over and UnderSampling\n',  Counter(y_rouC))
+        print(X_rouC.shape)
+        
+        #------------------------------
+        # SMOTE_NC: oversampling 
+        # [numerical + categorical]
+        #https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
+        #------------------------------
+        # Determine categorical and numerical features
+        numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
+        numerical_ix
+        num_featuresL = list(numerical_ix)
+        numerical_colind = X.columns.get_indexer(list(numerical_ix) )
+        numerical_colind
+        
+        categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
+        categorical_ix    
+        categorical_colind = X.columns.get_indexer(list(categorical_ix))
+        categorical_colind
+        
+        k_sm = 5 # default
+        sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
+        X_smnc, y_smnc = sm_nc.fit_resample(X, y)
+        print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
+        print(X_smnc.shape)
+        
+        print('\nGenerated resampled data as below:'
+            , '\n==========================='
+            , '\nRandom oversampling:'
+            , '\n==========================='
+             
+            , '\n\nTrain data size:', X_ros.shape
+     
+            , '\ny_train numbers:', y_ros
+            , '\n\ny_train ratio:', Counter(y_ros)[0]/Counter(y_ros)[0]
+            
+            , '\ny_test ratio:' , yc2_ratio
+            
+          , '\n-------------------------------------------------------------'
+      )
+        
+        
+     #   globals().update(locals()) # TROLOLOLOLOLOLS
+        
+    #return()