adding prints to pnca_config file

Tanushree Tunstall 2022-05-29 06:58:21 +01:00
parent 693a5324c1
commit c91a994828
2 changed files with 132 additions and 20 deletions

View file

@ -10,7 +10,7 @@ import os
gene = 'pncA'
drug = 'pyrazinamide'
total_mtblineage_u = 8
#total_mtblineage_u = 8
homedir = os.path.expanduser("~")
@ -22,5 +22,29 @@ from UQ_ML_data import *
# from YC run_all_ML: run locally
from UQ_MultModelsCl import MultModelsCl
print('TESTING cmd:'
, '\nGene name:', gene
, '\nDrug name:', drug
, '\nTotal input features:', X.shape
, '\n', Counter(y))
print('TESTING cmd:', Counter(y))
print('Structural features (n):'
, len(common_cols_stabiltyN) + len(foldX_cols) + len(X_strFN)
, '\nThese are:'
, '\nCommon stability features:', common_cols_stabiltyN
, '\nFoldX columns:', foldX_cols
, '\nOther struc columns:', X_strFN
, '\n================================================================\n')
print('Evolutionary features (n):'
, len(X_evolFN)
, '\nThese are:\n'
, X_evolFN
, '\n================================================================\n')
print('Genomic features (n):'
, len(X_genomicFN)
, '\nThese are:\n'
, X_genomic_mafor, '\n'
, X_genomic_linegae
, '\n================================================================\n')
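# If the structural, evolutionary and genomic groups are meant to partition the
# columns of X, a quick sanity check (sketch only; assumes the groups do not overlap)
# could be:
# n_struc = len(common_cols_stabiltyN) + len(foldX_cols) + len(X_strFN)
# n_evol  = len(X_evolFN)
# n_gen   = len(X_genomicFN)
# assert X.shape[1] == n_struc + n_evol + n_gen, 'feature groups do not cover X'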

View file

@ -26,6 +26,72 @@ from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
# Metric
from sklearn.metrics import mean_squared_error, make_scorer, roc_auc_score, f1_score, matthews_corrcoef, accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report
###############################################################################
# TT imports
from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
from copy import deepcopy
from sklearn import linear_model
from sklearn import datasets
from collections import Counter
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier, kernels
from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.feature_selection import RFE, RFECV
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
from statistics import mean, stdev, median, mode
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from sklearn.impute import KNNImputer as KNN
import json
##############################################################################
# other vars
rs = {'random_state': 42}
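# shared keyword arg for reproducibility, presumably unpacked into the estimators
# below (e.g. LogisticRegression(**rs)) so all models use the same seed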
@ -119,10 +185,25 @@ def run_all_ML(input_pd, target_label, blind_test_input_df, blind_test_target, p
_roc_auc = round(roc_auc_score(y_pred, y), 3)
_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel()
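# caution: sklearn's confusion_matrix and roc_auc_score expect (y_true, y_pred);
# calling them as (y_pred, y) transposes the confusion matrix, i.e. the values
# unpacked here as _fp and _fn are swapped relative to the usual convention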
# result_pd = result_pd.append(pd.DataFrame(np.column_stack([name
# , _tp, _tn
# , _fp , _fn
# , _roc_auc
# , _mcc
# , _bacc, _f1]),\
# columns=['estimator', 'TP', 'TN', 'FP', 'FN',
# 'roc_auc', 'matthew', 'bacc', 'f1']),\
# ignore_index=True)
result_pd = result_pd.append(pd.DataFrame(np.column_stack([name
, _mcc
, _roc_auc
, _bacc, _f1
, _tp, _tn
, _fp , _fn]),\
columns=['estimator', 'matthew', 'roc_auc', 'bacc', 'f1',\
'TP', 'TN', 'FP', 'FN']),\
ignore_index=True)
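# pandas' DataFrame.append is deprecated (and removed in pandas 2.0); an
# equivalent row append using pd.concat, assuming the same column names, would be:
# result_pd = pd.concat([result_pd,
#                        pd.DataFrame([[name, _mcc, _roc_auc, _bacc, _f1,
#                                       _tp, _tn, _fp, _fn]],
#                                     columns=['estimator', 'matthew', 'roc_auc', 'bacc', 'f1',
#                                              'TP', 'TN', 'FP', 'FN'])],
#                       ignore_index=True)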
#=========================
# Blind test: BTS results
#=========================
@ -140,15 +221,24 @@ def run_all_ML(input_pd, target_label, blind_test_input_df, blind_test_target, p
_roc_aucBTS = round(roc_auc_score(bts_predict, blind_test_target), 3)
_tnBTS, _fpBTS, _fnBTS, _tpBTS = confusion_matrix(bts_predict, blind_test_target).ravel()
# result_bts_pd = result_bts_pd.append(pd.DataFrame(np.column_stack([name
# , _tpBTS, _tnBTS
# , _fpBTS, _fnBTS
# , _roc_aucBTS
# , _mccBTS
# , _baccBTS, _f1BTS]),\
# columns=['estimator', 'TP', 'TN', 'FP', 'FN',
# 'roc_auc', 'matthew', 'bacc', 'f1']),\
# ignore_index=True)
result_bts_pd = result_bts_pd.append(pd.DataFrame(np.column_stack([name
, _mccBTS
, _roc_aucBTS
, _baccBTS, _f1BTS
, _tpBTS, _tnBTS
, _fpBTS, _fnBTS]),\
columns=['estimator', 'matthew', 'roc_auc', 'bacc', 'f1',\
'TP', 'TN', 'FP', 'FN']),\
ignore_index=True)
results_all['CrossValResultsDF'] = result_pd
@ -165,15 +255,13 @@ def run_all_ML(input_pd, target_label, blind_test_input_df, blind_test_target, p
#%% CALL function
#run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
# Baseline_data
YC_resD2 = run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
YC_resD_ros = run_all_ML(input_pd=X_ros, target_label=y_ros, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
CVResultsDF = YC_resD2['CrossValResultsDF']
CVResultsDF.sort_values(by=['matthew'], ascending=False, inplace=True)
BTSResultsDF = YC_resD2['BlindTestResultsDF']
BTSResultsDF.sort_values(by=['matthew'], ascending=False, inplace=True)
CVResultsDF_baseline = YC_resD2['CrossValResultsDF']
CVResultsDF_baseline.sort_values(by=['matthew'], ascending=False, inplace=True)
BTSResultsDF_baseline = YC_resD2['BlindTestResultsDF']
BTSResultsDF_baseline.sort_values(by=['matthew'], ascending=False, inplace=True)
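# To persist the sorted tables, something along these lines could be added
# (file names below are illustrative only):
# CVResultsDF_baseline.to_csv('cv_results_baseline.csv', index=False)
# BTSResultsDF_baseline.to_csv('bts_results_baseline.csv', index=False)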
# from sklearn.utils import all_estimators
# for name, algorithm in all_estimators(type_filter="classifier"):