diff --git a/MultModelsCl_CALL.py b/MultModelsCl_CALL.py index c285f0a..9c363bb 100644 --- a/MultModelsCl_CALL.py +++ b/MultModelsCl_CALL.py @@ -6,6 +6,38 @@ Created on Tue Mar 15 11:09:50 2022 @author: tanu """ +from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score +from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report + +from sklearn.model_selection import train_test_split, cross_validate, cross_val_score +from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold + +from sklearn.pipeline import Pipeline, make_pipeline +#%% GLOBALS +rs = {'random_state': 42} +njobs = {'n_jobs': 10} + +scoring_fn = ({'accuracy' : make_scorer(accuracy_score) + , 'fscore' : make_scorer(f1_score) + , 'mcc' : make_scorer(matthews_corrcoef) + , 'precision' : make_scorer(precision_score) + , 'recall' : make_scorer(recall_score) + , 'roc_auc' : make_scorer(roc_auc_score) + , 'jcc' : make_scorer(jaccard_score) + }) + +skf_cv = StratifiedKFold(n_splits = 10 + #, shuffle = False, random_state= None) + , shuffle = True,**rs) + +rskf_cv = RepeatedStratifiedKFold(n_splits = 10 + , n_repeats = 3 + , **rs) + +mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} +jacc_score_fn = {'jcc': make_scorer(jaccard_score)} + +############################################################################### #%% MultModelsCl: function call() mm_skf_scoresD = MultModelsCl(input_df = X , target = y @@ -22,35 +54,7 @@ baseline_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) baseline_BT = baseline_all.filter(like='bts_', axis=1) baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) -#%% SMOTE OS: Numerical only -# mm_skf_scoresD2 = MultModelsCl(input_df = X_sm -# , target = y_sm -# , var_type = 'numerical' -# , skf_cv = skf_cv) -# sm_all = pd.DataFrame(mm_skf_scoresD2) -# 
sm_all = sm_all.T -# sm_CT = sm_all.filter(like='test_', axis=1) -#sm_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) - -# sm_BT = sm_all.filter(like='bts_', axis=1) -#sm_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) - -#%% SMOTE ENN: Over + Undersampling combined: Numerical ONLY -# mm_skf_scoresD5 = MultModelsCl(input_df = X_enn -# , target = y_enn -# , var_type = 'numerical' -# , skf_cv = skf_cv -# , blind_test_input_df = X_bts -# , blind_test_target = y_bts) -# enn_all = pd.DataFrame(mm_skf_scoresD5) -# enn_all = enn_all.T - -# enn_CT = enn_all.filter(like='test_', axis=1) -#enn_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) - -# enn_BT = enn_all.filter(like='bts_', axis=1) -#enn_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) #%% SMOTE NC: Oversampling [Numerical + categorical] mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc , target = y_smnc @@ -97,7 +101,7 @@ rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) rus_BT = rus_all.filter(like='bts_' , axis=1) rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) #%% ROS + RUS Combined: Numerical + categorical -mm_skf_scoresD8= MultModelsCl(input_df = X_rouC +mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC , target = y_rouC , var_type = 'mixed' , skf_cv = skf_cv @@ -106,12 +110,43 @@ mm_skf_scoresD8= MultModelsCl(input_df = X_rouC rouC_all = pd.DataFrame(mm_skf_scoresD8) rouC_all = rouC_all.T -rouC_CT = ros_all.filter(like='test_', axis=1) +rouC_CT = rouC_all.filter(like='test_', axis=1) rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) -rouC_BT = ros_all.filter(like='bts_', axis=1) +rouC_BT = rouC_all.filter(like='bts_', axis=1) rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) -#%% + +#%% SMOTE OS: Numerical only +# mm_skf_scoresD2 = MultModelsCl(input_df = X_sm +# , target = y_sm +# , var_type = 'numerical' +# , skf_cv = skf_cv) +# sm_all = 
pd.DataFrame(mm_skf_scoresD2) +# sm_all = sm_all.T + +# sm_CT = sm_all.filter(like='test_', axis=1) +#sm_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +# sm_BT = sm_all.filter(like='bts_', axis=1) +#sm_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +#%% SMOTE ENN: Over + Undersampling combined: Numerical ONLY +# mm_skf_scoresD5 = MultModelsCl(input_df = X_enn +# , target = y_enn +# , var_type = 'numerical' +# , skf_cv = skf_cv +# , blind_test_input_df = X_bts +# , blind_test_target = y_bts) +# enn_all = pd.DataFrame(mm_skf_scoresD5) +# enn_all = enn_all.T + +# enn_CT = enn_all.filter(like='test_', axis=1) +#enn_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +# enn_BT = enn_all.filter(like='bts_', axis=1) +#enn_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +#%% Repeated ENN # mm_skf_scoresD6 = MultModelsCl(input_df = X_renn # , target = y_renn # , var_type = 'numerical' diff --git a/UQ_Imbalance.py b/UQ_Imbalance.py index d471e86..893c4ef 100644 --- a/UQ_Imbalance.py +++ b/UQ_Imbalance.py @@ -56,7 +56,7 @@ X_ros, y_ros = oversample.fit_resample(X, y) print(X_ros.shape) #228 #------------------------------ -# Simple Random oversampling +# Simple Random undersampling # [Numerical + catgeorical] #------------------------------ undersample = RandomUnderSampler(sampling_strategy='majority') diff --git a/UQ_MultModelsCl.py b/UQ_MultModelsCl.py index e5b55b6..942d980 100644 --- a/UQ_MultModelsCl.py +++ b/UQ_MultModelsCl.py @@ -6,89 +6,93 @@ Created on Fri Mar 4 15:25:33 2022 @author: tanu """ #%% - import os, sys import pandas as pd import numpy as np import pprint as pp -#from copy import deepcopy +from copy import deepcopy from sklearn import linear_model -from sklearn.linear_model import LogisticRegression, LinearRegression +from sklearn import datasets +from collections import Counter + +from sklearn.linear_model import LogisticRegression, LogisticRegressionCV +from 
sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier + from sklearn.naive_bayes import BernoulliNB from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC -from sklearn.tree import DecisionTreeClassifier -from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier +from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier +from sklearn.naive_bayes import GaussianNB +from sklearn.gaussian_process import GaussianProcessClassifier, kernels +from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel + +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis from sklearn.neural_network import MLPClassifier + +from sklearn.svm import SVC from xgboost import XGBClassifier +from sklearn.naive_bayes import MultinomialNB from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder from sklearn.compose import ColumnTransformer from sklearn.compose import make_column_transformer -from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score -from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score -from sklearn.metrics import make_scorer -from sklearn.metrics import classification_report +from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score +from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report -from sklearn.metrics import average_precision_score +from sklearn.model_selection import train_test_split, cross_validate, cross_val_score +from sklearn.model_selection import 
StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold -from sklearn.model_selection import cross_validate -from sklearn.model_selection import train_test_split -from sklearn.model_selection import StratifiedKFold +from sklearn.pipeline import Pipeline, make_pipeline -from sklearn.pipeline import Pipeline -from sklearn.pipeline import make_pipeline +from sklearn.feature_selection import RFE, RFECV -from sklearn.feature_selection import RFE -from sklearn.feature_selection import RFECV import itertools import seaborn as sns import matplotlib.pyplot as plt -import numpy as np -print(np.__version__) -print(pd.__version__) + from statistics import mean, stdev, median, mode from imblearn.over_sampling import RandomOverSampler +from imblearn.under_sampling import RandomUnderSampler from imblearn.over_sampling import SMOTE -from imblearn.pipeline import Pipeline from sklearn.datasets import make_classification -from sklearn.model_selection import cross_validate -from sklearn.model_selection import RepeatedStratifiedKFold -from sklearn.ensemble import AdaBoostClassifier from imblearn.combine import SMOTEENN +from imblearn.combine import SMOTETomek + +from imblearn.over_sampling import SMOTENC from imblearn.under_sampling import EditedNearestNeighbours +from imblearn.under_sampling import RepeatedEditedNearestNeighbours -from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis -from sklearn.neural_network import MLPClassifier +from sklearn.model_selection import GridSearchCV +from sklearn.base import BaseEstimator +from sklearn.impute import KNNImputer as KNN +import json -from sklearn.linear_model import RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier -from sklearn.discriminant_analysis import LinearDiscriminantAnalysis -from sklearn.svm import SVC -from xgboost import XGBClassifier -from sklearn.naive_bayes import MultinomialNB -from sklearn.naive_bayes import GaussianNB - -from sklearn.linear_model import SGDClassifier -from 
sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder -from sklearn.utils import all_estimators - -from sklearn.linear_model import LogisticRegression, LogisticRegressionCV - -#%% +#%% GLOBALS rs = {'random_state': 42} njobs = {'n_jobs': 10} -scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef) - , 'fscore' : make_scorer(f1_score) - , 'precision' : make_scorer(precision_score) - , 'recall' : make_scorer(recall_score) - , 'accuracy' : make_scorer(accuracy_score) - , 'roc_auc' : make_scorer(roc_auc_score) - , 'jaccard' : make_scorer(jaccard_score) - }) +scoring_fn = ({'accuracy' : make_scorer(accuracy_score) + , 'fscore' : make_scorer(f1_score) + , 'mcc' : make_scorer(matthews_corrcoef) + , 'precision' : make_scorer(precision_score) + , 'recall' : make_scorer(recall_score) + , 'roc_auc' : make_scorer(roc_auc_score) + , 'jcc' : make_scorer(jaccard_score) + }) + +skf_cv = StratifiedKFold(n_splits = 10 + #, shuffle = False, random_state= None) + , shuffle = True,**rs) +rskf_cv = RepeatedStratifiedKFold(n_splits = 10 + , n_repeats = 3 + , **rs) + +mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} +jacc_score_fn = {'jcc': make_scorer(jaccard_score)} #%% # Multiple Classification - Model Pipeline def MultModelsCl(input_df, target, skf_cv @@ -111,9 +115,9 @@ def MultModelsCl(input_df, target, skf_cv returns Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training - ''' - # determine categorical and numerical features + + # Determine categorical and numerical features numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns numerical_ix categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns @@ -133,7 +137,7 @@ def MultModelsCl(input_df, target, skf_cv col_transform = ColumnTransformer(transformers = t , remainder='passthrough') - #%% Specify multiple Classification models + # Specify multiple Classification models lr = LogisticRegression(**rs) lrcv = 
LogisticRegressionCV(**rs) gnb = GaussianNB() diff --git a/UQ_pnca_ML.py b/UQ_pnca_ML.py index 8136892..66b739a 100644 --- a/UQ_pnca_ML.py +++ b/UQ_pnca_ML.py @@ -16,11 +16,29 @@ import pprint as pp from copy import deepcopy from collections import Counter from sklearn.impute import KNNImputer as KNN +from imblearn.over_sampling import RandomOverSampler +from imblearn.under_sampling import RandomUnderSampler +from imblearn.over_sampling import SMOTE +from sklearn.datasets import make_classification +from imblearn.combine import SMOTEENN +from imblearn.combine import SMOTETomek + +from imblearn.over_sampling import SMOTENC +from imblearn.under_sampling import EditedNearestNeighbours +from imblearn.under_sampling import RepeatedEditedNearestNeighbours + +#%% REMOVE once config is set up +from UQ_MultModelsCl import MultModelsCl + +rs = {'random_state': 42} +njobs = {'n_jobs': 10} #%% +homedir = os.path.expanduser("~") + #============== # directories -#============== +#============== datadir = homedir + '/git/Data/' indir = datadir + drug + '/input/' outdir = datadir + drug + '/output/' @@ -122,12 +140,12 @@ common_cols_stabiltyN = ['ligand_distance' , 'ddg_dynamut2'] foldX_cols = ['contacts' -#, 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss' -#, 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss' -#, 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss' -#, 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss' -#, 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss' -#, 'volumetric_rr', 'volumetric_mm', 'volumetric_ss' +, 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss' +, 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss' +, 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss' +, 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss' +, 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss' +, 'volumetric_rr', 'volumetric_mm', 'volumetric_ss' ] X_strFN = ['rsa' @@ -196,7 +214,6 @@ all_df_wtgt = 
training_df[numerical_FN + categorical_FN + ['dst_mode']] all_df_wtgt.shape #%%================================================================ #%% Apply ML -#TODO: A #%% Data #------ @@ -222,17 +239,89 @@ X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']] # Quick check (X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum() -#%% MultClassPipeSKFCV: function call() -# mm_skf_scoresD = MultClassPipeSKFCV(input_df = X -# , target = y -# , var_type = 'numerical' -# , skf_cv = skf_cv) +############################################################################## +print('Original Data\n', Counter(y) + , 'Data dim:', X.shape) +############################################################################### +#%% +############################################################################ +# RESAMPLING +############################################################################### +#------------------------------ +# Simple Random oversampling +# [Numerical + categorical] +#------------------------------ +oversample = RandomOverSampler(sampling_strategy='minority') +X_ros, y_ros = oversample.fit_resample(X, y) +print('Simple Random OverSampling\n', Counter(y_ros)) +print(X_ros.shape) -# mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD) -# mm_skf_scores_df_all -# mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0) -# mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results -# print(mm_skf_scores_df_train) -# print(mm_skf_scores_df_test) +#------------------------------ +# Simple Random Undersampling +# [Numerical + categorical] +#------------------------------ +undersample = RandomUnderSampler(sampling_strategy='majority') +X_rus, y_rus = undersample.fit_resample(X, y) +print('Simple Random UnderSampling\n', Counter(y_rus)) +print(X_rus.shape) +#------------------------------ +# Simple combine ROS and RUS +# [Numerical + categorical] +#------------------------------ 
+oversample = RandomOverSampler(sampling_strategy='minority') +X_ros, y_ros = oversample.fit_resample(X, y) +undersample = RandomUnderSampler(sampling_strategy='majority') +X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros) +print('Simple Combined Over and UnderSampling\n', Counter(y_rouC)) +print(X_rouC.shape) + +#------------------------------ +# SMOTE_NC: oversampling +# [numerical + categorical] +#https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python +#------------------------------ +# Determine categorical and numerical features +numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns +numerical_ix +num_featuresL = list(numerical_ix) +numerical_colind = X.columns.get_indexer(list(numerical_ix) ) +numerical_colind + +categorical_ix = X.select_dtypes(include=['object', 'bool']).columns +categorical_ix +categorical_colind = X.columns.get_indexer(list(categorical_ix)) +categorical_colind + +k_sm = 5 # 5 is default +sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs) +X_smnc, y_smnc = sm_nc.fit_resample(X, y) +print('SMOTE_NC OverSampling\n', Counter(y_smnc)) +print(X_smnc.shape) + +############################################################################### +#%% SMOTE RESAMPLING for NUMERICAL ONLY* +# #------------------------------ +# # SMOTE: Oversampling +# # [Numerical ONLY] +# #------------------------------ +# k_sm = 1 +# sm = SMOTE(sampling_strategy = 'auto', k_neighbors = k_sm, **rs) +# X_sm, y_sm = sm.fit_resample(X, y) +# print(X_sm.shape) +# print('SMOTE OverSampling\n', Counter(y_sm)) +# y_sm_df = y_sm.to_frame() +# y_sm_df.value_counts().plot(kind = 'bar') + +# #------------------------------ +# # SMOTE: Over + Undersampling COMBINED +# # [Numerical ONLY] +# #----------------------------- +# sm_enn = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all', **rs, **njobs )) +# X_enn, y_enn = sm_enn.fit_resample(X, y) +# 
print(X_enn.shape) +# print('SMOTE Over+Under Sampling combined\n', Counter(y_enn)) + +############################################################################### +# TODO: Find over and undersampling JUST for categorical data diff --git a/pnca_config.py b/pnca_config.py index 705fecf..d367781 100644 --- a/pnca_config.py +++ b/pnca_config.py @@ -20,11 +20,9 @@ MyGlobalVars() os.chdir(homedir + "/git/ML_AI_training/") # my function -from UQ_MultClassPipe4 import MultClassPipeSKFCV +from UQ_MultModelsCl import MultModelsCl from UQ_pnca_ML.py import * -#from scriptsfymcn import run_all_ML - # YC_resD2 = run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')