From ee163d3978cdf6f9dfad3eb5cb7690d287cc326c Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 19 May 2022 02:37:00 +0100 Subject: [PATCH] added UQ scripts to do hyperparam ML models --- UQ_LR.py | 207 ++++++++++++++++++++++++++++++++++++ UQ_RF.py | 140 ++++++++++++++++++++++++ UQ_pnca_ML.py | 287 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 634 insertions(+) create mode 100644 UQ_LR.py create mode 100644 UQ_RF.py create mode 100644 UQ_pnca_ML.py diff --git a/UQ_LR.py b/UQ_LR.py new file mode 100644 index 0000000..9f20f32 --- /dev/null +++ b/UQ_LR.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon May 16 05:59:12 2022 + +@author: tanu +""" +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Mar 15 11:09:50 2022 + +@author: tanu +""" +#%% Import libs +import numpy as np +import pandas as pd +from sklearn.model_selection import GridSearchCV +from sklearn import datasets +from sklearn.ensemble import ExtraTreesClassifier +from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import AdaBoostClassifier +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.svm import SVC + +from sklearn.base import BaseEstimator +from sklearn.naive_bayes import MultinomialNB +from sklearn.linear_model import SGDClassifier +from sklearn.pipeline import Pipeline +from sklearn.model_selection import GridSearchCV +from sklearn.linear_model import LogisticRegression +from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder +from xgboost import XGBClassifier +rs = {'random_state': 42} +njobs = {'n_jobs': 10} +#%% Get train-test split and scoring functions +# X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN] +# , num_df_wtgt['mutation_class'] +# , test_size = 0.33 +# , random_state = 2 +# , shuffle = True +# , stratify = num_df_wtgt['mutation_class']) + +y.to_frame().value_counts().plot(kind = 'bar') +blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar') + +scoring_fn = ({'accuracy' : make_scorer(accuracy_score) + , 'fscore' : make_scorer(f1_score) + , 'mcc' : make_scorer(matthews_corrcoef) + , 'precision' : make_scorer(precision_score) + , 'recall' : make_scorer(recall_score) + , 'roc_auc' : make_scorer(roc_auc_score) + , 'jaccard' : make_scorer(jaccard_score) + }) + +mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} +jacc_score_fn = {'jcc': make_scorer(jaccard_score)} + +#%% Logistic Regression + hyperparam: BaseEstimator: ClfSwitcher() +class ClfSwitcher(BaseEstimator): + def __init__( + self, + estimator = SGDClassifier(), + ): + """ + A Custom BaseEstimator that can switch between classifiers. + :param estimator: sklearn object - The classifier + """ + self.estimator = estimator + + def fit(self, X, y=None, **kwargs): + self.estimator.fit(X, y) + return self + + def predict(self, X, y=None): + return self.estimator.predict(X) + + def predict_proba(self, X): + return self.estimator.predict_proba(X) + + def score(self, X, y): + return self.estimator.score(X, y) + +parameters = [ + { + 'clf__estimator': [LogisticRegression(**rs)], + #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], + 'clf__estimator__C': np.logspace(0, 4, 10), + 'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'], + 'clf__estimator__max_iter': list(range(100,800,100)), + 'clf__estimator__solver': ['saga'] + }, + { + 'clf__estimator': [LogisticRegression(**rs)], + #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], + 'clf__estimator__C': np.logspace(0, 4, 10), + 'clf__estimator__penalty': ['l2', 'none'], + 'clf__estimator__max_iter': list(range(100,800,100)), + 'clf__estimator__solver': ['newton-cg', 'lbfgs', 'sag'] + }, + { + 'clf__estimator': [LogisticRegression(**rs)], + #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], + 'clf__estimator__C': np.logspace(0, 4, 10), + 'clf__estimator__penalty': ['l1', 'l2'], + 'clf__estimator__max_iter': list(range(100,800,100)), + 'clf__estimator__solver': ['liblinear'] + } + +] + +# Create pipeline +pipeline = Pipeline([ + ('pre', MinMaxScaler()), + ('clf', ClfSwitcher()), +]) + +# Grid search i.e hyperparameter tuning and refitting on mcc +gscv_lr = GridSearchCV(pipeline + , parameters + #, scoring = 'f1', refit = 'f1' + , scoring = mcc_score_fn, refit = 'mcc' + , cv = skf_cv + , **njobs + , return_train_score = False + , verbose = 3) + +# Fit +gscv_lr_fit = gscv_lr.fit(X, y) +gscv_lr_fit_be_mod = gscv_lr_fit.best_params_ +gscv_lr_fit_be_res = gscv_lr_fit.cv_results_ + +print('Best model:\n', gscv_lr_fit_be_mod) +print('Best models score:\n', gscv_lr_fit.best_score_, ':' , round(gscv_lr_fit.best_score_, 2)) + +#print('\nMean test score from fit results:', round(mean(gscv_lr_fit_be_res['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(np.nanmean(gscv_lr_fit_be_res['mean_test_mcc']),2)) + + +###################################### +# Blind test +###################################### +# See how it does on the BLIND test +#print('\nBlind test score, mcc:', )) + +test_predict = gscv_lr_fit.predict(X_bts) +print(test_predict) +print(np.array(y_bts)) +y_btsf = np.array(y_bts) + +print(accuracy_score(y_bts, test_predict)) +print(matthews_corrcoef(y_bts, test_predict)) + +# create a dict with all scores +lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items()) + 'bts_fscore':None + , 'bts_mcc':None + , 'bts_precision':None + , 'bts_recall':None + , 'bts_accuracy':None + , 'bts_roc_auc':None + , 'bts_jaccard':None } +lr_bts_dict +lr_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) +lr_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) +lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +lr_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +lr_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +lr_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +lr_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +lr_bts_dict + +# Create a df from dict with all scores +pd.DataFrame.from_dict(lr_bts_dict, orient = 'index', columns = 'best_model') + +lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict,orient = 'index') +lr_bts_df.columns = ['Logistic_Regression'] +print(lr_bts_df) + +# d2 = {'best_model_params': lis(gscv_lr_fit_be_mod.items() )} +# d2 +# def Merge(dict1, dict2): +# res = {**dict1, **dict2} +# return res +# d3 = Merge(d2, lr_bts_dict) +# d3 + +# Create df with best model params +model_params = pd.Series(['best_model_params', list(gscv_lr_fit_be_mod.items() )]) +model_params_df = model_params.to_frame() +model_params_df +model_params_df.columns = ['Logistic_Regression'] +model_params_df.columns + +# Combine the df of scores and the best model params +lr_bts_df.columns +lr_output = pd.concat([model_params_df, lr_bts_df], axis = 0) +lr_output + +# Format the combined df +# Drop the best_model_params row from lr_output +lr_df = lr_output.drop([0], axis = 0) +lr_df + +#FIXME: tidy the index of the formatted df + +############################################################################### diff --git a/UQ_RF.py b/UQ_RF.py new file mode 100644 index 0000000..ca3c292 --- /dev/null +++ b/UQ_RF.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed May 18 06:03:24 2022 + +@author: tanu +""" +#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher() +class ClfSwitcher(BaseEstimator): + def __init__( + self, + estimator = SGDClassifier(), + ): + """ + A Custom BaseEstimator that can switch between classifiers. + :param estimator: sklearn object - The classifier + """ + self.estimator = estimator + + def fit(self, X, y=None, **kwargs): + self.estimator.fit(X, y) + return self + + def predict(self, X, y=None): + return self.estimator.predict(X) + + def predict_proba(self, X): + return self.estimator.predict_proba(X) + + def score(self, X, y): + return self.estimator.score(X, y) + +parameters = [ + { + 'clf__estimator': [RandomForestClassifier(**rs + , **njobs + , bootstrap = True + , oob_score = True)], + 'clf__estimator__max_depth': [4, 6, 8, 10, 12, 16, 20, None] + , 'clf__estimator__class_weight':['balanced','balanced_subsample'] + , 'clf__estimator__n_estimators': [10, 25, 50, 100] + , 'clf__estimator__criterion': ['gini', 'entropy']#, 'log_loss'] + #, 'clf__estimator__max_features': ['auto', 'sqrt'] + , 'clf__estimator__min_samples_leaf': [1, 2, 3, 4, 5, 10] + , 'clf__estimator__min_samples_split': [2, 5, 15, 20] + } +] + +# Create pipeline +pipeline = Pipeline([ + ('pre', MinMaxScaler()), + ('clf', ClfSwitcher()), +]) + +# Grid search i.e hyperparameter tuning and refitting on mcc +gscv_rf = GridSearchCV(pipeline + , parameters + #, scoring = 'f1', refit = 'f1' + , scoring = mcc_score_fn, refit = 'mcc' + , cv = skf_cv + , **njobs + , return_train_score = False + , verbose = 3) + +# Fit +gscv_rf_fit = gscv_rf.fit(X, y) + +gscv_rf_fit_be_mod = gscv_rf_fit.best_params_ +gscv_rf_fit_be_res = gscv_rf_fit.cv_results_ + +print('Best model:\n', gscv_rf_fit_be_mod) +print('Best models score:\n', gscv_rf_fit.best_score_, ':' , round(gscv_rf_fit.best_score_, 2)) + +print('\nMean test score from fit results:', round(mean(gscv_rf_fit_be_re['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(np.nanmean(gscv_rf_fit_be_res['mean_test_mcc']),2)) + +###################################### +# Blind test +###################################### + +# See how it does on the BLIND test +#print('\nBlind test score, mcc:', ) + +test_predict = gscv_rf_fit.predict(X_bts) +print(test_predict) +print(np.array(y_bts)) +y_btsf = np.array(y_bts) + +print(accuracy_score(y_btsf, test_predict)) +print(matthews_corrcoef(y_btsf, test_predict)) + +# create a dict with all scores +rf_bts_dict = {#'best_model': list(gscv_rf_fit_be_mod.items()) + 'bts_fscore' : None + , 'bts_mcc' : None + , 'bts_precision': None + , 'bts_recall' : None + , 'bts_accuracy' : None + , 'bts_roc_auc' : None + , 'bts_jaccard' : None } +rf_bts_dict +rf_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) +rf_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) +rf_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +rf_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +rf_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +rf_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +rf_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +rf_bts_dict + +# Create a df from dict with all scores +pd.DataFrame.from_dict(rf_bts_dict, orient = 'index', columns = 'best_model') + +rf_bts_df = pd.DataFrame.from_dict(rf_bts_dict,orient = 'index') +rf_bts_df.columns = ['Logistic_Regression'] +print(rf_bts_df) + +# Create df with best model params +model_params = pd.Series(['best_model_params', list(gscv_rf_fit_be_mod.items() )]) +model_params_df = model_params.to_frame() +model_params_df +model_params_df.columns = ['Logistic_Regression'] +model_params_df.columns + +# Combine the df of scores and the best model params +rf_bts_df.columns +rf_output = pd.concat([model_params_df, rf_bts_df], axis = 0) +rf_output + +# Format the combined df +# Drop the best_model_params row from rf_output +rf_df = rf_output.drop([0], axis = 0) +rf_df + +#FIXME: tidy the index of the formatted df + +############################################################################### + + + diff --git a/UQ_pnca_ML.py b/UQ_pnca_ML.py new file mode 100644 index 0000000..48ab9a0 --- /dev/null +++ b/UQ_pnca_ML.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sun Mar 6 13:41:54 2022 + +@author: tanu +""" +import os, sys +import pandas as pd +import numpy as np +import pprint as pp +from copy import deepcopy +from sklearn import linear_model +from sklearn.linear_model import LogisticRegression, LinearRegression +from sklearn.naive_bayes import BernoulliNB +from sklearn.neighbors import KNeighborsClassifier +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier +from sklearn.ensemble import AdaBoostClassifier +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.neural_network import MLPClassifier +from xgboost import XGBClassifier +from sklearn.naive_bayes import MultinomialNB +from sklearn.linear_model import SGDClassifier +from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder + +from sklearn.compose import ColumnTransformer +from sklearn.compose import make_column_transformer + +from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score +from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score +from sklearn.metrics import jaccard_score + +from sklearn.metrics import make_scorer +from sklearn.metrics import classification_report + +from sklearn.metrics import average_precision_score + +from sklearn.model_selection import cross_validate +from sklearn.model_selection import train_test_split +from sklearn.model_selection import StratifiedKFold + +from sklearn.pipeline import Pipeline +from sklearn.pipeline import make_pipeline + +from sklearn.feature_selection import RFE +from sklearn.feature_selection import RFECV +import itertools +#import seaborn as sns +import matplotlib.pyplot as plt +import numpy as np +print(np.__version__) +print(pd.__version__) +from statistics import mean, stdev, median, mode + +#from imblearn.over_sampling import RandomOverSampler +#from imblearn.over_sampling import SMOTE +#from imblearn.pipeline import Pipeline +#from sklearn.datasets import make_classification +from sklearn.model_selection import cross_validate, cross_val_score +from sklearn.model_selection import RepeatedStratifiedKFold +from sklearn.ensemble import AdaBoostClassifier +#from imblearn.combine import SMOTEENN +#from imblearn.under_sampling import EditedNearestNeighbours + +from sklearn.model_selection import GridSearchCV +from sklearn.base import BaseEstimator + +scoring_fn = ({'accuracy' : make_scorer(accuracy_score) + , 'fscore' : make_scorer(f1_score) + , 'mcc' : make_scorer(matthews_corrcoef) + , 'precision' : make_scorer(precision_score) + , 'recall' : make_scorer(recall_score) + , 'roc_auc' : make_scorer(roc_auc_score) + , 'jcc' : make_scorer(jaccard_score) + }) + +rs = {'random_state': 42} +njobs = {'n_jobs': 10} +skf_cv = StratifiedKFold(n_splits = 10 + #, shuffle = False, random_state= None) + , shuffle = True,**rs) + +rskf_cv = RepeatedStratifiedKFold(n_splits = 10 + , n_repeats=3 + #, shuffle = False, random_state= None) + #, shuffle = True + ,**rs) +#my_mcc = make_scorer({'mcc':make_scorer(matthews_corrcoef}) +mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} + +#%% +homedir = os.path.expanduser("~") +os.chdir(homedir + "/git/ML_AI_training/") + +# my function +#from MultClassPipe import MultClassPipeline +from MultClassPipe2 import MultClassPipeline2 +from loopity_loop import MultClassPipeSKFLoop +from MultClassPipe3 import MultClassPipeSKFCV + +gene = 'pncA' +drug = 'pyrazinamide' + +#============== +# directories +#============== +datadir = homedir + '/git/Data/' +indir = datadir + drug + '/input/' +outdir = datadir + drug + '/output/' + +#======= +# input +#======= +infile_ml1 = outdir + gene.lower() + '_merged_df3.csv' +#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv' + +my_df = pd.read_csv(infile_ml1, index_col = 0) +my_df.dtypes +my_df_cols = my_df.columns + +geneL_basic = ['pnca'] + +# -- CHECK script -- imports.py +#%% get cols +mycols = my_df.columns +mycols + +# change from numberic to +num_type = ['int64', 'float64'] +cat_type = ['object', 'bool'] + +# TODO: +# Treat active site aa pos as category and not numerical: This needs to be part of merged_df3! +#if my_df['active_aa_pos'].dtype in num_type: +# my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object) +# my_df['active_aa_pos'].dtype + +# -- CHECK script -- imports.py +#%%============================================================================ +#%% IMPUTE values for OR + +#%% Combine mmCSM_lig Data + +#%% Combine PROVEAN data + +#%% Combine ED logo data + +#%% Masking columns (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10 + +# get logic from upstream! +my_df_ml = my_df.copy() + +my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts() +my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() +my_df_ml.groupby(['mutationinformation'])['ligand_distance'].apply(lambda x: (x>10)).value_counts() + +my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0 +(my_df_ml['ligand_affinity_change'] == 0).sum() + +#%%============================================================================ +# Separate blind test set +my_df_ml[drug].isna().sum() + +blind_test_df = my_df_ml[my_df_ml[drug].isna()] +blind_test_df.shape + +training_df = my_df_ml[my_df_ml[drug].notna()] +training_df.shape + +# Target1: dst +training_df[drug].value_counts() +training_df['dst_mode'].value_counts() + +#%% Build X +common_cols_stabiltyN = ['ligand_distance' + , 'ligand_affinity_change' + , 'duet_stability_change' + , 'ddg_foldx' + , 'deepddg' + , 'ddg_dynamut2'] + +foldX_cols = ['contacts' +#, 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss' +#, 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss' +#, 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss' +#, 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss' +#, 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss' +#, 'volumetric_rr', 'volumetric_mm', 'volumetric_ss' +] + +X_strFN = ['rsa' + #, 'asa' + , 'kd_values' + , 'rd_values'] + +X_evolFN = ['consurf_score' + , 'snap2_score'] + +# quick inspection which lineage to use: +#foo = my_df_ml[['lineage', 'lineage_count_all', 'lineage_count_unique']] + +X_genomicFN = ['maf' +# , 'or_mychisq' +# , 'or_logistic' +# , 'or_fisher' +# , 'pval_fisher' + #, 'lineage' + , 'lineage_count_all' + , 'lineage_count_unique' +] + +#%% Construct numerical and categorical column names + +# numerical feature names +numerical_FN = common_cols_stabiltyN + foldX_cols + X_strFN + X_evolFN + X_genomicFN + +#categorical feature names +categorical_FN = ['ss_class' + , 'wt_prop_water' + # , 'lineage_labels' # misleading if using merged_df3 + , 'mut_prop_water' + , 'wt_prop_polarity' + , 'mut_prop_polarity' + , 'wt_calcprop' + , 'mut_calcprop' + #, 'active_aa_pos' + ] + +#%% extracting dfs based on numerical, categorical column names +#---------------------------------- +# WITHOUT the target var included +#---------------------------------- +num_df = training_df[numerical_FN] +num_df.shape + +cat_df = training_df[categorical_FN] +cat_df.shape + +all_df = training_df[numerical_FN + categorical_FN] +all_df.shape + +#------------------------------ +# WITH the target var included: + #'wtgt': with target +#------------------------------ +# drug and dst_mode should be the same thing +num_df_wtgt = training_df[numerical_FN + ['dst_mode']] +num_df_wtgt.shape + +cat_df_wtgt = training_df[categorical_FN + ['dst_mode']] +cat_df_wtgt.shape + +all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']] +all_df_wtgt.shape +#%%================================================================ +#%% Apply ML +#TODO: Apply oversampling! + +#%% Data +#X = all_df_wtgt[numerical_FN+categorical_FN] +X = all_df_wtgt[numerical_FN] +y = all_df_wtgt['dst_mode'] + +#Blind test data {same format} +X_bts = blind_test_df[numerical_FN] +y_bts = blind_test_df['dst_mode'] + +X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']] + +# Quick check +(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum() +#%% MultClassPipeSKFCV: function call() +mm_skf_scoresD = MultClassPipeSKFCV(input_df = X + , target = y + , var_type = 'numerical' + , skf_cv = skf_cv) + + +mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD) +mm_skf_scores_df_all +mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0) +mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results +print(mm_skf_scores_df_train) +print(mm_skf_scores_df_test) +