added ml_functions dir
parent c85c965c3e
commit 9aadb0329f
30 changed files with 683 additions and 606160 deletions
@@ -1,73 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 25 11:07:30 2022

@author: tanu
"""
import sys, os
import pandas as pd
import numpy as np
import re

###############################################################################
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/functions')
###############################################################################
#====================
# Import ML functions
#====================
#from MultClfs import *
from GetMLData import *
from SplitTTS import *
#%% Load all gene files #######################################################
# param dict
combined_model_paramD = {'data_combined_model'         : True
                         , 'use_or'                    : False
                         , 'omit_all_genomic_features' : False
                         , 'write_maskfile'            : False
                         , 'write_outfile'             : False}

pnca_df = getmldata('pncA', 'pyrazinamide' , **combined_model_paramD)
embb_df = getmldata('embB', 'ethambutol'   , **combined_model_paramD)
katg_df = getmldata('katG', 'isoniazid'    , **combined_model_paramD)
rpob_df = getmldata('rpoB', 'rifampicin'   , **combined_model_paramD)
gid_df  = getmldata('gid' , 'streptomycin' , **combined_model_paramD)
alr_df  = getmldata('alr' , 'cycloserine'  , **combined_model_paramD)

# quick check
foo = pd.concat([alr_df, pnca_df])
check1 = foo.filter(regex = '.*_affinity|gene_name|ligand_distance', axis = 1)
# So, pd.concat joins correctly but introduces NAs for columns missing from one gene.
# TODO: discuss whether to make these 0 and use them, or just omit them.
# For now I am omitting these, i.e. combining only on common columns.
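# Note (added): a toy illustration of the NA behaviour flagged above:
# pd.concat aligns on the union of columns and fills gaps with NaN, and the
# TODO's 'make these 0' option amounts to a fillna(0). The frames here are
# made up for illustration, not the gene data.
_a = pd.DataFrame({'gene_name': ['alr'] , 'foo_affinity': [1.2]})
_b = pd.DataFrame({'gene_name': ['pncA'], 'bar_affinity': [3.4]})
print(pd.concat([_a, _b]))            # NaN where a column is missing
print(pd.concat([_a, _b]).fillna(0))  # the 'make these 0' alternative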

expected_nrows = len(pnca_df) + len(embb_df) + len(katg_df) + len(rpob_df) + len(gid_df) + len(alr_df)
# (available for a row-count sanity check against combined_df below)

# finding common columns
dfs_combine = [pnca_df, embb_df, katg_df, rpob_df, gid_df, alr_df]
common_cols = list(set.intersection(*(set(df.columns) for df in dfs_combine)))
expected_ncols = np.min([len(df.columns) for df in dfs_combine])
expected_ncols

if len(common_cols) == expected_ncols:
    print('\nProceeding to combine based on common cols (n):', len(common_cols))
    combined_df = pd.concat([df[common_cols] for df in dfs_combine], ignore_index = False)
    print('\nSuccessfully combined dfs:'
          , '\nNo. of dfs combined:', len(dfs_combine)
          , '\nDim of combined df:', combined_df.shape)
else:
    print('\nFAIL: could not combine dfs, length mismatch'
          , '\nExpected ncols:', expected_ncols
          , '\nGot:', len(common_cols))
#%% split data into different data types
tts_7030_paramD = {'data_type'      : 'actual'
                   , 'split_type'   : '70_30'
                   , 'oversampling' : True}

data_CM_7030D = split_tts(ml_input_data        = combined_df
                          , **tts_7030_paramD
                          , dst_colname        = 'dst'
                          , target_colname     = 'dst_mode'
                          , include_gene_name  = False) # when not doing leave one group out
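
# -----------------------------------------------------------------------------
# Note (added): a minimal sketch of the leave-one-group-out alternative the
# comment above refers to, using sklearn's LeaveOneGroupOut with gene_name as
# the group label. Assumes gene_name is among the common columns of combined_df
# (it appears in check1 above); an illustration only, not the original pipeline.
from sklearn.model_selection import LeaveOneGroupOut

logo_df = combined_df[combined_df['dst'].notna()]  # 'actual' subset, as in split_tts
groups  = logo_df['gene_name']
X_logo  = logo_df.drop(['dst', 'dst_mode', 'gene_name'], axis = 1)
y_logo  = logo_df['dst_mode']
for train_ix, test_ix in LeaveOneGroupOut().split(X_logo, y_logo, groups):
    print('held-out gene:', groups.iloc[test_ix].unique())
# -----------------------------------------------------------------------------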
@@ -1,221 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 25 11:07:30 2022

@author: tanu
"""
import os, sys
import pandas as pd
import numpy as np
print(np.__version__)
print(pd.__version__)
import pprint as pp
from copy import deepcopy
from collections import Counter
from sklearn.impute import KNNImputer as KNN
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours

from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report

from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, RepeatedKFold

from sklearn.pipeline import Pipeline, make_pipeline
import argparse
import re

homedir = os.path.expanduser("~")
#%% Globals
rs = {'random_state': 42}
njobs = {'n_jobs': 10}
#%% Define split_tts function #################################################
def split_tts(ml_input_data
              , data_type      = ['actual', 'complete', 'reverse']
              , split_type     = ['70_30', '80_20', 'sl']
              , oversampling   = True
              , dst_colname    = 'dst' # determines how to subset the actual vs reverse data
              , target_colname = 'dst_mode'):

    print('\nInput params:'
          , '\nDim of input df:'    , ml_input_data.shape
          , '\nData type to split:' , data_type
          , '\nSplit type:'         , split_type
          , '\ntarget colname:'     , target_colname)

    if oversampling:
        print('\noversampling enabled')
    else:
        print('\nNot generating oversampled or undersampled data')

    #====================================
    # evaluating use_data_type
    #====================================
    if data_type == 'actual':
        ml_data = ml_input_data[ml_input_data[dst_colname].notna()]
    elif data_type == 'complete':
        ml_data = ml_input_data.copy()
    elif data_type == 'reverse':
        ml_data = ml_input_data[ml_input_data[dst_colname].isna()]
    else:
        sys.exit('\nFAIL: data_type must be one of: actual, complete or reverse')

    #====================================
    # separate features and target
    #====================================
    x_features = ml_data.drop([target_colname, dst_colname], axis = 1)
    y_target   = ml_data[target_colname]

    # sanity check
    if target_colname not in x_features.columns:
        print('\nPASS: x_features has no target variable')
        x_ncols = len(x_features.columns)
        print('\nNo. of columns for x_features:', x_ncols)
    else:
        sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')

    #====================================
    # Train test split
    # with stratification
    #====================================
    if split_type == '70_30':
        tts_test_size = 0.33 # nominal 70/30; actual test fraction used is 0.33
    elif split_type == '80_20':
        tts_test_size = 0.2
    elif split_type == 'sl':
        tts_test_size = 1/np.sqrt(x_ncols) # test fraction scales with feature count
        train_sl = 1 - tts_test_size
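        # Note (added): e.g. with x_ncols = 100 features, tts_test_size =
        # 1/sqrt(100) = 0.10 and train_sl = 0.90; the test fraction shrinks
        # as the feature count grows.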
    else:
        sys.exit('\nFAIL: split_type must be one of: 70_30, 80_20 or sl')

    #-------------------------
    # TTS split ~ split_type
    #-------------------------
    # x_train, x_test, y_train, y_test # traditional var_names
    # kept as X, X_bts, y, y_bts so my downstream code doesn't need to change
    X, X_bts, y, y_bts = train_test_split(x_features, y_target
                                          , test_size = tts_test_size
                                          , **rs
                                          , stratify  = y_target)
    yc1 = Counter(y)
    yc1_ratio = yc1[0]/yc1[1]

    yc2 = Counter(y_bts)
    yc2_ratio = yc2[0]/yc2[1]
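    # Note (added): yc1_ratio and yc2_ratio compare label-0 vs label-1 counts
    # in the train and test splits; with stratify = y_target the two ratios
    # should be near-identical, and values far from 1 flag the class imbalance
    # that the resampling block below addresses.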
    ###########################################################################
    #======================================================
    # Determine categorical and numerical features
    #======================================================
    # NB: int64/float64 only; np.number would also catch 32-bit dtypes
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
    numerical_cols
    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
    categorical_cols
    ###########################################################################
    print('\n-------------------------------------------------------------'
          , '\nSuccessfully generated training and test data:'
          , '\nData used:' , data_type
          , '\nSplit type:', split_type

          , '\n\nTotal no. of input features:'        , len(X.columns)
          , '\n--------No. of numerical features:'    , len(numerical_cols)
          , '\n--------No. of categorical features:'  , len(categorical_cols)

          , '\n\nTotal data size:', len(X) + len(X_bts)

          , '\n\nTrain data size:', X.shape
          , '\ny_train numbers:', yc1

          , '\n\nTest data size:', X_bts.shape
          , '\ny_test numbers:', yc2

          , '\n\ny_train ratio:', yc1_ratio
          , '\ny_test ratio:', yc2_ratio
          , '\n-------------------------------------------------------------'
          )

    if oversampling:

        #######################################################################
        # RESAMPLING
        #######################################################################
        #------------------------------
        # Simple Random Oversampling
        # [numerical + categorical]
        #------------------------------
        oversample = RandomOverSampler(sampling_strategy='minority')
        X_ros, y_ros = oversample.fit_resample(X, y)
        print('\nSimple Random OverSampling\n', Counter(y_ros))
        print(X_ros.shape)

        #------------------------------
        # Simple Random Undersampling
        # [numerical + categorical]
        #------------------------------
        undersample = RandomUnderSampler(sampling_strategy='majority')
        X_rus, y_rus = undersample.fit_resample(X, y)
        print('\nSimple Random UnderSampling\n', Counter(y_rus))
        print(X_rus.shape)

        #------------------------------
        # Simple combined ROS and RUS
        # [numerical + categorical]
        #------------------------------
        oversample = RandomOverSampler(sampling_strategy='minority')
        X_ros, y_ros = oversample.fit_resample(X, y)
        undersample = RandomUnderSampler(sampling_strategy='majority')
        X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
        print('\nSimple Combined Over and UnderSampling\n', Counter(y_rouC))
        print(X_rouC.shape)

        #------------------------------
        # SMOTE_NC: oversampling
        # [numerical + categorical]
        # https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
        #------------------------------
        # Determine categorical and numerical features
        numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
        numerical_ix
        num_featuresL = list(numerical_ix)
        numerical_colind = X.columns.get_indexer(list(numerical_ix))
        numerical_colind

        categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
        categorical_ix
        categorical_colind = X.columns.get_indexer(list(categorical_ix))
        categorical_colind

        k_sm = 5 # default
        sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
        X_smnc, y_smnc = sm_nc.fit_resample(X, y)
        print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
        print(X_smnc.shape)
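        # Note (added): unlike plain SMOTE, SMOTENC interpolates only the
        # numerical features; each synthetic sample's categorical values are
        # taken from the most frequent category among its nearest neighbours,
        # which is why the categorical column indices are passed above.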

        print('\nGenerated resampled data as below:'
              , '\n==========================='
              , '\nRandom oversampling:'
              , '\n==========================='

              , '\n\nTrain data size:', X_ros.shape

              , '\ny_train numbers:', Counter(y_ros)
              , '\n\ny_train ratio:', Counter(y_ros)[0]/Counter(y_ros)[1]

              , '\ny_test ratio:' , yc2_ratio

              , '\n-------------------------------------------------------------'
              )

    # globals().update(locals()) # TROLOLOLOLOLOLS

    #return()
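    # Note (added): a minimal sketch of a return value, since the original
    # leaves #return() commented out while the caller does
    # data_CM_7030D = split_tts(...). The key names here are assumptions,
    # not confirmed by the original code.
    outDict = {'X': X, 'y': y, 'X_bts': X_bts, 'y_bts': y_bts}
    if oversampling:
        outDict.update({'X_ros'   : X_ros , 'y_ros' : y_ros
                        , 'X_rus' : X_rus , 'y_rus' : y_rus
                        , 'X_rouC': X_rouC, 'y_rouC': y_rouC
                        , 'X_smnc': X_smnc, 'y_smnc': y_smnc})
    return outDict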