saving and organising work to call from cmd line

This commit is contained in:
Tanushree Tunstall 2022-05-28 11:25:04 +01:00
parent d9a1888e8c
commit f2634f77ef
5 changed files with 232 additions and 106 deletions

View file

@ -6,6 +6,38 @@ Created on Tue Mar 15 11:09:50 2022
@author: tanu
"""
from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
from sklearn.pipeline import Pipeline, make_pipeline
#%% GLOBALS
# Shared keyword-argument dicts, unpacked with ** into constructors so the
# seed / worker count are set in one place.
rs = {'random_state': 42}
njobs = {'n_jobs': 10}
# Metric name -> scorer object, for cross-validation scoring.
# NOTE(review): make_scorer() with default arguments scores the estimator's
# predicted labels, so 'roc_auc' here is presumably computed from hard class
# predictions rather than probabilities/decision values - confirm intended.
scoring_fn = ({'accuracy' : make_scorer(accuracy_score)
, 'fscore' : make_scorer(f1_score)
, 'mcc' : make_scorer(matthews_corrcoef)
, 'precision' : make_scorer(precision_score)
, 'recall' : make_scorer(recall_score)
, 'roc_auc' : make_scorer(roc_auc_score)
, 'jcc' : make_scorer(jaccard_score)
})
# 10-fold stratified CV; shuffling is enabled and seeded via rs.
skf_cv = StratifiedKFold(n_splits = 10
#, shuffle = False, random_state= None)
, shuffle = True,**rs)
# 10-fold stratified CV repeated 3 times, seeded via rs.
rskf_cv = RepeatedStratifiedKFold(n_splits = 10
, n_repeats = 3
, **rs)
# Single-metric scorer dicts (MCC only / Jaccard only).
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
###############################################################################
#%% MultModelsCl: function call()
mm_skf_scoresD = MultModelsCl(input_df = X
, target = y
@ -22,35 +54,7 @@ baseline_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
baseline_BT = baseline_all.filter(like='bts_', axis=1)
baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
#%% SMOTE OS: Numerical only
# mm_skf_scoresD2 = MultModelsCl(input_df = X_sm
# , target = y_sm
# , var_type = 'numerical'
# , skf_cv = skf_cv)
# sm_all = pd.DataFrame(mm_skf_scoresD2)
# sm_all = sm_all.T
# sm_CT = sm_all.filter(like='test_', axis=1)
#sm_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# sm_BT = sm_all.filter(like='bts_', axis=1)
#sm_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
#%% SMOTE ENN: Over + Undersampling combined: Numerical ONLY
# mm_skf_scoresD5 = MultModelsCl(input_df = X_enn
# , target = y_enn
# , var_type = 'numerical'
# , skf_cv = skf_cv
# , blind_test_input_df = X_bts
# , blind_test_target = y_bts)
# enn_all = pd.DataFrame(mm_skf_scoresD5)
# enn_all = enn_all.T
# enn_CT = enn_all.filter(like='test_', axis=1)
#enn_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# enn_BT = enn_all.filter(like='bts_', axis=1)
#enn_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
, target = y_smnc
@ -97,7 +101,7 @@ rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
rus_BT = rus_all.filter(like='bts_' , axis=1)
rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8= MultModelsCl(input_df = X_rouC
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
, target = y_rouC
, var_type = 'mixed'
, skf_cv = skf_cv
@ -106,12 +110,43 @@ mm_skf_scoresD8= MultModelsCl(input_df = X_rouC
rouC_all = pd.DataFrame(mm_skf_scoresD8)
rouC_all = rouC_all.T
rouC_CT = ros_all.filter(like='test_', axis=1)
rouC_CT = rouC_all.filter(like='test_', axis=1)
rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
rouC_BT = ros_all.filter(like='bts_', axis=1)
rouC_BT = rouC_all.filter(like='bts_', axis=1)
rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
#%%
#%% SMOTE OS: Numerical only
# mm_skf_scoresD2 = MultModelsCl(input_df = X_sm
# , target = y_sm
# , var_type = 'numerical'
# , skf_cv = skf_cv)
# sm_all = pd.DataFrame(mm_skf_scoresD2)
# sm_all = sm_all.T
# sm_CT = sm_all.filter(like='test_', axis=1)
#sm_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# sm_BT = sm_all.filter(like='bts_', axis=1)
#sm_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
#%% SMOTE ENN: Over + Undersampling combined: Numerical ONLY
# mm_skf_scoresD5 = MultModelsCl(input_df = X_enn
# , target = y_enn
# , var_type = 'numerical'
# , skf_cv = skf_cv
# , blind_test_input_df = X_bts
# , blind_test_target = y_bts)
# enn_all = pd.DataFrame(mm_skf_scoresD5)
# enn_all = enn_all.T
# enn_CT = enn_all.filter(like='test_', axis=1)
#enn_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# enn_BT = enn_all.filter(like='bts_', axis=1)
#enn_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
#%% Repeated ENN
# mm_skf_scoresD6 = MultModelsCl(input_df = X_renn
# , target = y_renn
# , var_type = 'numerical'

View file

@ -56,7 +56,7 @@ X_ros, y_ros = oversample.fit_resample(X, y)
print(X_ros.shape) #228
#------------------------------
# Simple Random oversampling
# Simple Random undersampling
# [Numerical + categorical]
#------------------------------
undersample = RandomUnderSampler(sampling_strategy='majority')

View file

@ -6,89 +6,93 @@ Created on Fri Mar 4 15:25:33 2022
@author: tanu
"""
#%%
import os, sys
import pandas as pd
import numpy as np
import pprint as pp
#from copy import deepcopy
from copy import deepcopy
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import datasets
from collections import Counter
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier, kernels
from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE, RFECV
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
from statistics import mean, stdev, median, mode
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from sklearn.impute import KNNImputer as KNN
import json
from sklearn.linear_model import RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.utils import all_estimators
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
#%%
#%% GLOBALS
rs = {'random_state': 42}
njobs = {'n_jobs': 10}
scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef)
scoring_fn = ({'accuracy' : make_scorer(accuracy_score)
, 'fscore' : make_scorer(f1_score)
, 'mcc' : make_scorer(matthews_corrcoef)
, 'precision' : make_scorer(precision_score)
, 'recall' : make_scorer(recall_score)
, 'accuracy' : make_scorer(accuracy_score)
, 'roc_auc' : make_scorer(roc_auc_score)
, 'jaccard' : make_scorer(jaccard_score)
, 'jcc' : make_scorer(jaccard_score)
})
skf_cv = StratifiedKFold(n_splits = 10
#, shuffle = False, random_state= None)
, shuffle = True,**rs)
rskf_cv = RepeatedStratifiedKFold(n_splits = 10
, n_repeats = 3
, **rs)
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
#%%
# Multiple Classification - Model Pipeline
def MultModelsCl(input_df, target, skf_cv
@ -111,9 +115,9 @@ def MultModelsCl(input_df, target, skf_cv
returns
Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training
'''
# determine categorical and numerical features
# Determine categorical and numerical features
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
@ -133,7 +137,7 @@ def MultModelsCl(input_df, target, skf_cv
col_transform = ColumnTransformer(transformers = t
, remainder='passthrough')
#%% Specify multiple Classification models
# Specify multiple Classification models
lr = LogisticRegression(**rs)
lrcv = LogisticRegressionCV(**rs)
gnb = GaussianNB()

View file

@ -16,11 +16,29 @@ import pprint as pp
from copy import deepcopy
from collections import Counter
from sklearn.impute import KNNImputer as KNN
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
#%% REMOVE once config is set up
from UQ_MultModelsCl import MultModelsCl
rs = {'random_state': 42}
njobs = {'n_jobs': 10}
#%%
homedir = os.path.expanduser("~")
#==============
# directories
#==============
#==============
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
@ -122,12 +140,12 @@ common_cols_stabiltyN = ['ligand_distance'
, 'ddg_dynamut2']
foldX_cols = ['contacts'
#, 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
#, 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
#, 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
#, 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
#, 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
#, 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'
, 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
, 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
, 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
, 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
, 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
, 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'
]
X_strFN = ['rsa'
@ -196,7 +214,6 @@ all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']]
all_df_wtgt.shape
#%%================================================================
#%% Apply ML
#TODO: A
#%% Data
#------
@ -222,17 +239,89 @@ X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']]
# Quick check
(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
#%% MultClassPipeSKFCV: function call()
# mm_skf_scoresD = MultClassPipeSKFCV(input_df = X
# , target = y
# , var_type = 'numerical'
# , skf_cv = skf_cv)
##############################################################################
# Class counts and dimensions of the original training data, printed so each
# resampled set below can be compared against it.
print('Original Data\n', Counter(y)
, 'Data dim:', X.shape)
###############################################################################
#%%
###############################################################################
# RESAMPLING
# Every sampler below is seeded with **rs (like SMOTENC) so that repeated runs
# produce the same resampled data.
###############################################################################
#------------------------------
# Simple Random oversampling
# [Numerical + categorical]
#------------------------------
oversample = RandomOverSampler(sampling_strategy='minority', **rs)
X_ros, y_ros = oversample.fit_resample(X, y)
print('Simple Random OverSampling\n', Counter(y_ros))
print(X_ros.shape)
# mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
# mm_skf_scores_df_all
# mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0)
# mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results
# print(mm_skf_scores_df_train)
# print(mm_skf_scores_df_test)
#------------------------------
# Simple Random Undersampling
# [Numerical + categorical]
#------------------------------
undersample = RandomUnderSampler(sampling_strategy='majority', **rs)
X_rus, y_rus = undersample.fit_resample(X, y)
print('Simple Random UnderSampling\n', Counter(y_rus))
print(X_rus.shape)
#------------------------------
# Simple combine ROS and RUS
# [Numerical + categorical]
# Oversample the minority class first, then undersample the majority class
# of the oversampled data.
# NOTE(review): with a two-class target the oversampled data is presumably
# already balanced, so the undersampling step may leave the class counts
# unchanged - confirm against the printed Counter.
#------------------------------
oversample = RandomOverSampler(sampling_strategy='minority', **rs)
X_ros, y_ros = oversample.fit_resample(X, y)
undersample = RandomUnderSampler(sampling_strategy='majority', **rs)
X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
print('Simple Combined Over and UnderSampling\n', Counter(y_rouC))
print(X_rouC.shape)
#------------------------------
# SMOTE_NC: oversampling
# [numerical + categorical]
#https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
#------------------------------
# Determine categorical and numerical features by dtype, then translate the
# column labels into positional indices; SMOTENC is given the categorical
# columns by position.
numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
num_featuresL = list(numerical_ix)
numerical_colind = X.columns.get_indexer(list(numerical_ix) )
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
categorical_colind = X.columns.get_indexer(list(categorical_ix))
k_sm = 5 # 5 is default
sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
X_smnc, y_smnc = sm_nc.fit_resample(X, y)
print('SMOTE_NC OverSampling\n', Counter(y_smnc))
print(X_smnc.shape)
###############################################################################
#%% SMOTE RESAMPLING for NUMERICAL ONLY*
# #------------------------------
# # SMOTE: Oversampling
# # [Numerical ONLY]
# #------------------------------
# k_sm = 1
# sm = SMOTE(sampling_strategy = 'auto', k_neighbors = k_sm, **rs)
# X_sm, y_sm = sm.fit_resample(X, y)
# print(X_sm.shape)
# print('SMOTE OverSampling\n', Counter(y_sm))
# y_sm_df = y_sm.to_frame()
# y_sm_df.value_counts().plot(kind = 'bar')
# #------------------------------
# # SMOTE: Over + Undersampling COMBINED
# # [Numerical ONLY]
# #-----------------------------
# sm_enn = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all', **rs, **njobs ))
# X_enn, y_enn = sm_enn.fit_resample(X, y)
# print(X_enn.shape)
# print('SMOTE Over+Under Sampling combined\n', Counter(y_enn))
###############################################################################
# TODO: Find over and undersampling JUST for categorical data

View file

@ -20,11 +20,9 @@ MyGlobalVars()
# Switch to the project directory (presumably so the project-local modules
# imported below can be found - confirm how sys.path is set up).
os.chdir(homedir + "/git/ML_AI_training/")
# my function
from UQ_MultClassPipe4 import MultClassPipeSKFCV
from UQ_MultModelsCl import MultModelsCl
# Import by module name, without the ".py" file suffix: "UQ_pnca_ML.py" is
# parsed as submodule "py" of a package "UQ_pnca_ML" and fails with
# ModuleNotFoundError.
from UQ_pnca_ML import *
#from scriptsfymcn import run_all_ML
# YC_resD2 = run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')