added ml_functions dir

2022-06-29 12:06:47 +01:00 · 2022-06-29 12:06:47 +01:00 · 9aadb0329f
commit 9aadb0329f
parent c85c965c3e
30 changed files with 683 additions and 606160 deletions
--- a/scripts/ml/combined_model/ml_data_combined
+++ b/scripts/ml/combined_model/ml_data_combined
@ -1,73 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Sat Jun 25 11:07:30 2022
-
-@author: tanu
-"""
-
-import sys, os
-import pandas as pd
-import numpy as np
-import re
-
-###############################################################################
-homedir = os.path.expanduser("~")
-sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/functions')
-###############################################################################
-#====================
-# Import ML functions 
-#====================
-#from MultClfs import *
-from GetMLData import *
-from SplitTTS import *
-#%% Load all gene files #######################################################
-# param dict
-combined_model_paramD = {'data_combined_model'   : True
-                    , 'use_or'                   : False
-                    , 'omit_all_genomic_features': False
-                    , 'write_maskfile'           : False
-                    , 'write_outfile'            : False }
-
-pnca_df = getmldata('pncA', 'pyrazinamide' , **combined_model_paramD)
-embb_df = getmldata('embB', 'ethambutol'   , **combined_model_paramD)
-katg_df = getmldata('katG', 'isoniazid'    , **combined_model_paramD)
-rpob_df = getmldata('rpoB', 'rifampicin'   , **combined_model_paramD)
-gid_df  = getmldata('gid' , 'streptomycin' , **combined_model_paramD)
-alr_df  = getmldata('alr' , 'cycloserine'  , **combined_model_paramD)
-
-# quick check
-foo = pd.concat([alr_df, pnca_df])
-check1 = foo.filter(regex= '.*_affinity|gene_name|ligand_distance', axis = 1)
-# So, pd.concat will join correctly but introduce NAs.
-# TODO: discuss whether to make these 0 and use it or just omit
-# For now I am omitting these i.e combining only on common columns
-
-expected_nrows  = len(pnca_df) + len(embb_df) + len(katg_df) + len(rpob_df) + len(gid_df) + len(alr_df)
-
-# finding common columns
-dfs_combine = [pnca_df, embb_df, katg_df, rpob_df, gid_df, alr_df]
-common_cols = list(set.intersection(*(set(df.columns) for df in dfs_combine)))
-expected_ncols = np.min([len(pnca_df.columns)] + [len(embb_df.columns)] + [len(katg_df.columns)] + [len(rpob_df.columns)] + [len(gid_df.columns)] + [len(alr_df.columns)])                                                                                                                              
-expected_ncols
-
-if len(common_cols) == expected_ncols:
-    print('\nProceeding to combine based on common cols (n):', len(common_cols))
-    combined_df = pd.concat([df[common_cols] for df in dfs_combine], ignore_index = False)
-    print('\nSuccessfully combined dfs:'
-          , '\nNo. of dfs combined:', len(dfs_combine)
-          , '\nDim of combined df:', combined_df.shape)
-else:
-    print('\nFAIL: could not combine dfs, length mismatch'
-          , '\nExpected ncols:', expected_ncols
-          , '\nGot:',  len(common_cols))
-#%% split data into different data types
-tts_7030_paramD = {'data_type'    : 'actual'
-              , 'split_type'      : '70_30'
-              , 'oversampling'    : True}
-                   
-data_CM_7030D = split_tts(ml_input_data = combined_df
-          , **tts_7030_paramD
-          , dst_colname = 'dst'
-          , target_colname = 'dst_mode'
-          , include_gene_name = False) # when not doing leave one group out        
--- a/scripts/ml/combined_model/untitled0.py
+++ b/scripts/ml/combined_model/untitled0.py
@ -1,221 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Sat Jun 25 11:07:30 2022
-
-@author: tanu
-"""
-
-import sys, os
-import pandas as pd
-import numpy as np
-import os, sys
-import pandas as pd
-import numpy as np
-print(np.__version__)
-print(pd.__version__)
-import pprint as pp
-from copy import deepcopy
-from collections import Counter
-from sklearn.impute import KNNImputer as KNN
-from imblearn.over_sampling import RandomOverSampler
-from imblearn.under_sampling import RandomUnderSampler
-from imblearn.over_sampling import SMOTE
-from sklearn.datasets import make_classification
-from imblearn.combine import SMOTEENN
-from imblearn.combine import SMOTETomek
-
-from imblearn.over_sampling import SMOTENC
-from imblearn.under_sampling import EditedNearestNeighbours
-from imblearn.under_sampling import RepeatedEditedNearestNeighbours
-
-from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
-from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
-
-from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
-from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
-
-from sklearn.pipeline import Pipeline, make_pipeline
-import argparse
-import re
-homedir = os.path.expanduser("~")
-#%% Globals
-# Keyword-argument dicts that split_tts() unpacks into calls (**rs, **njobs).
-rs = {'random_state': 42}  # fixed seed passed to train_test_split and SMOTENC
-njobs = {'n_jobs': 10}  # passed to SMOTENC
-#%% Define split_tts function #################################################
-def split_tts(ml_input_data
-              , data_type      = ['actual', 'complete', 'reverse']
-              , split_type     = ['70_30', '80_20', 'sl']
-              , oversampling   = True
-              , dst_colname    = 'dst'# determine how to subset the actual vs reverse data
-              , target_colname = 'dst_mode'):
-    
-    print('\nInput params:' 
-          , '\nDim of input df:'   , ml_input_data.shape
-          , '\nData type to split:', data_type
-          , '\nSplit type:'        , split_type
-          , '\ntarget colname:'    , target_colname)
-        
-    if oversampling:
-        print('\noversampling enabled')
-    else:
-        print('\nNot generating oversampled or undersampled data')
-
-    #====================================
-    # evaluating use_data_type
-    #====================================
-    if data_type == 'actual':
-        ml_data = ml_input_data[ml_input_data[dst_colname].notna()]
-    if data_type == 'complete':
-        ml_data = ml_input_data.copy()
-    if data_type == 'reverse':
-        ml_data = ml_input_data[ml_input_data[dst_colname].isna()]
-    #if_data_type == none
-        
-    #====================================
-    # separate features and target
-    #====================================
-    x_features = ml_data.drop([target_colname, dst_colname], axis = 1)
-    y_target   = ml_data[target_colname]
-        
-    # sanity check
-    if not 'dst_mode' in x_features.columns:
-        print('\nPASS: x_features has no target variable')
-        x_ncols = len(x_features.columns)
-        print('\nNo. of columns for x_features:', x_ncols)
-    else:
-        sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')
-        
-    #====================================
-    # Train test split
-    # with stratification
-    #=====================================
-    if split_type == '70_30':
-        tts_test_size = 0.33
-    if split_type == '80_20':
-        tts_test_size = 0.2
-    if split_type == 'sl':
-        tts_test_size = 1/np.sqrt(x_ncols)
-        train_sl = 1 - tts_test_size
-    
-    #-------------------------
-    #  TTS split ~ split_type
-    #-------------------------
-    #x_train, x_test, y_train, y_test # traditional var_names
-    # so my downstream code doesn't need to change    
-    X, X_bts, y, y_bts = train_test_split(x_features, y_target
-                                                    , test_size = tts_test_size
-                                                    , **rs
-                                                    , stratify = y_target)
-    yc1 = Counter(y)
-    yc1_ratio = yc1[0]/yc1[1]
-    
-    yc2 = Counter(y_bts)
-    yc2_ratio = yc2[0]/yc2[1]
-    ###############################################################################
-    #======================================================
-    # Determine categorical and numerical features
-    #======================================================
-    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
-    numerical_cols 
-    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
-    categorical_cols 
-    ###############################################################################
-    print('\n-------------------------------------------------------------'
-          , '\nSuccessfully generated training and test data:'
-          , '\nData used:' , data_type
-          , '\nSplit type:', split_type
-
-          , '\n\nTotal no. of input features:'      , len(X.columns)
-          , '\n--------No. of numerical features:'  , len(numerical_cols)
-          , '\n--------No. of categorical features:', len(categorical_cols)
-      
-          , '\n\nTotal data size:', len(X) + len(X_bts)
-    
-          , '\n\nTrain data size:', X.shape
-          , '\ny_train numbers:', yc1
-    
-          , '\n\nTest data size:', X_bts.shape
-          , '\ny_test_numbers:', yc2
-    
-          , '\n\ny_train ratio:',yc1_ratio
-          , '\ny_test ratio:', yc2_ratio
-          , '\n-------------------------------------------------------------'
-      )
-    
-    if oversampling:
-        
-        #######################################################################
-        #                               RESAMPLING
-        #######################################################################
-        #------------------------------
-        # Simple Random oversampling
-        # [Numerical + catgeorical]
-        #------------------------------
-        oversample = RandomOverSampler(sampling_strategy='minority')
-        X_ros, y_ros = oversample.fit_resample(X, y)
-        print('\nSimple Random OverSampling\n', Counter(y_ros))
-        print(X_ros.shape)
-        
-        #------------------------------
-        # Simple Random Undersampling
-        # [Numerical + catgeorical]
-        #------------------------------
-        undersample = RandomUnderSampler(sampling_strategy='majority')
-        X_rus, y_rus = undersample.fit_resample(X, y)
-        print('\nSimple Random UnderSampling\n', Counter(y_rus))
-        print(X_rus.shape)
-        
-        #------------------------------
-        # Simple combine ROS and RUS
-        # [Numerical + catgeorical]
-        #------------------------------
-        oversample = RandomOverSampler(sampling_strategy='minority')
-        X_ros, y_ros = oversample.fit_resample(X, y)
-        undersample = RandomUnderSampler(sampling_strategy='majority')
-        X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
-        print('\nSimple Combined Over and UnderSampling\n',  Counter(y_rouC))
-        print(X_rouC.shape)
-        
-        #------------------------------
-        # SMOTE_NC: oversampling 
-        # [numerical + categorical]
-        #https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
-        #------------------------------
-        # Determine categorical and numerical features
-        numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
-        numerical_ix
-        num_featuresL = list(numerical_ix)
-        numerical_colind = X.columns.get_indexer(list(numerical_ix) )
-        numerical_colind
-        
-        categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
-        categorical_ix    
-        categorical_colind = X.columns.get_indexer(list(categorical_ix))
-        categorical_colind
-        
-        k_sm = 5 # default
-        sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
-        X_smnc, y_smnc = sm_nc.fit_resample(X, y)
-        print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
-        print(X_smnc.shape)
-        
-        print('\nGenerated resampled data as below:'
-            , '\n==========================='
-            , '\nRandom oversampling:'
-            , '\n==========================='
-             
-            , '\n\nTrain data size:', X_ros.shape
-     
-            , '\ny_train numbers:', y_ros
-            , '\n\ny_train ratio:', Counter(y_ros)[0]/Counter(y_ros)[0]
-            
-            , '\ny_test ratio:' , yc2_ratio
-            
-          , '\n-------------------------------------------------------------'
-      )
-        
-        
-     #   globals().update(locals()) # TROLOLOLOLOLOLS
-        
-    #return()