added UQ scripts for hyperparameter tuning of ML models

This commit is contained in:
Tanushree Tunstall 2022-05-19 02:37:00 +01:00
parent 4dbc90ad44
commit ee163d3978
3 changed files with 634 additions and 0 deletions

UQ_LR.py (new file, 207 lines)

@@ -0,0 +1,207 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 16 05:59:12 2022
@author: tanu
"""
#%% Import libs
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.base import BaseEstimator
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import (accuracy_score, f1_score, matthews_corrcoef
                             , precision_score, recall_score, roc_auc_score
                             , jaccard_score, make_scorer)
from xgboost import XGBClassifier
rs = {'random_state': 42}
njobs = {'n_jobs': 10}
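# Shared kwargs: unpacking with ** (e.g. LogisticRegression(**rs),
# GridSearchCV(..., **njobs)) keeps random_state and n_jobs consistent throughout.
skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, **rs) # mirrors the CV splitter defined in UQ_pnca_ML.py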
#%% Get train-test split and scoring functions
# X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
# , num_df_wtgt['mutation_class']
# , test_size = 0.33
# , random_state = 2
# , shuffle = True
# , stratify = num_df_wtgt['mutation_class'])
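# NOTE: X, y, X_bts, y_bts and blind_test_df are built in UQ_pnca_ML.py, which
# is assumed to have been run first in the same session.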
y.to_frame().value_counts().plot(kind = 'bar')
blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar')
scoring_fn = ({'accuracy' : make_scorer(accuracy_score)
, 'fscore' : make_scorer(f1_score)
, 'mcc' : make_scorer(matthews_corrcoef)
, 'precision' : make_scorer(precision_score)
, 'recall' : make_scorer(recall_score)
, 'roc_auc' : make_scorer(roc_auc_score)
, 'jaccard' : make_scorer(jaccard_score)
})
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
#%% Logistic Regression + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
def __init__(
self,
estimator = SGDClassifier(),
):
"""
A Custom BaseEstimator that can switch between classifiers.
:param estimator: sklearn object - The classifier
"""
self.estimator = estimator
def fit(self, X, y=None, **kwargs):
self.estimator.fit(X, y)
return self
def predict(self, X, y=None):
return self.estimator.predict(X)
def predict_proba(self, X):
return self.estimator.predict_proba(X)
def score(self, X, y):
return self.estimator.score(X, y)
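# ClfSwitcher exposes the wrapped classifier as a tunable 'estimator' parameter,
# so GridSearchCV can swap whole models via the 'clf__estimator' key below.
# The LR grid is split into three dicts because each solver supports only some
# penalties: saga takes l1/l2/elasticnet/none; newton-cg, lbfgs and sag take
# l2/none; liblinear takes l1/l2.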
parameters = [
    {
        'clf__estimator': [LogisticRegression(**rs)],
        #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'clf__estimator__C': np.logspace(0, 4, 10),
        'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
        'clf__estimator__l1_ratio': [0.5], # required when penalty = 'elasticnet'
        'clf__estimator__max_iter': list(range(100, 800, 100)),
        'clf__estimator__solver': ['saga']
    },
{
'clf__estimator': [LogisticRegression(**rs)],
#'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'clf__estimator__C': np.logspace(0, 4, 10),
'clf__estimator__penalty': ['l2', 'none'],
'clf__estimator__max_iter': list(range(100,800,100)),
'clf__estimator__solver': ['newton-cg', 'lbfgs', 'sag']
},
{
'clf__estimator': [LogisticRegression(**rs)],
#'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'clf__estimator__C': np.logspace(0, 4, 10),
'clf__estimator__penalty': ['l1', 'l2'],
'clf__estimator__max_iter': list(range(100,800,100)),
'clf__estimator__solver': ['liblinear']
}
]
# Create pipeline
pipeline = Pipeline([
('pre', MinMaxScaler()),
('clf', ClfSwitcher()),
])
# Grid search, i.e. hyperparameter tuning, refitting on MCC
gscv_lr = GridSearchCV(pipeline
, parameters
#, scoring = 'f1', refit = 'f1'
, scoring = mcc_score_fn, refit = 'mcc'
, cv = skf_cv
, **njobs
, return_train_score = False
, verbose = 3)
# Fit
gscv_lr_fit = gscv_lr.fit(X, y)
gscv_lr_fit_be_mod = gscv_lr_fit.best_params_
gscv_lr_fit_be_res = gscv_lr_fit.cv_results_
print('Best model:\n', gscv_lr_fit_be_mod)
print("Best model's score:\n", gscv_lr_fit.best_score_, ':', round(gscv_lr_fit.best_score_, 2))
#print('\nMean test score from fit results:', round(mean(gscv_lr_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_lr_fit_be_res['mean_test_mcc']),2))
######################################
# Blind test
######################################
# See how it does on the BLIND test
test_predict = gscv_lr_fit.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
y_btsf = np.array(y_bts)
print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))
# create a dict with all scores
lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items())
'bts_fscore':None
, 'bts_mcc':None
, 'bts_precision':None
, 'bts_recall':None
, 'bts_accuracy':None
, 'bts_roc_auc':None
, 'bts_jaccard':None }
lr_bts_dict
lr_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2)
lr_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2)
lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
lr_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2)
lr_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2)
lr_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2)
lr_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
lr_bts_dict
# Create a df from dict with all scores
pd.DataFrame.from_dict(lr_bts_dict, orient = 'index', columns = ['best_model']) # quick preview; columns must be a list
lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict,orient = 'index')
lr_bts_df.columns = ['Logistic_Regression']
print(lr_bts_df)
# d2 = {'best_model_params': list(gscv_lr_fit_be_mod.items() )}
# d2
# def Merge(dict1, dict2):
# res = {**dict1, **dict2}
# return res
# d3 = Merge(d2, lr_bts_dict)
# d3
# Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_lr_fit_be_mod.items() )])
model_params_df = model_params.to_frame()
model_params_df
model_params_df.columns = ['Logistic_Regression']
model_params_df.columns
# Combine the df of scores and the best model params
lr_bts_df.columns
lr_output = pd.concat([model_params_df, lr_bts_df], axis = 0)
lr_output
# Format the combined df
# Drop the best_model_params row from lr_output
lr_df = lr_output.drop([0], axis = 0)
lr_df
#FIXME: tidy the index of the formatted df
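# One possible way to tidy the index for the FIXME above (a sketch, not wired in):
# lr_df = lr_df.reset_index().rename(columns = {'index': 'score_type'})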
###############################################################################

UQ_RF.py (new file, 140 lines)

@@ -0,0 +1,140 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022
@author: tanu
"""
#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
def __init__(
self,
estimator = SGDClassifier(),
):
"""
A Custom BaseEstimator that can switch between classifiers.
:param estimator: sklearn object - The classifier
"""
self.estimator = estimator
def fit(self, X, y=None, **kwargs):
self.estimator.fit(X, y)
return self
def predict(self, X, y=None):
return self.estimator.predict(X)
def predict_proba(self, X):
return self.estimator.predict_proba(X)
def score(self, X, y):
return self.estimator.score(X, y)
parameters = [
{
'clf__estimator': [RandomForestClassifier(**rs
, **njobs
, bootstrap = True
, oob_score = True)],
'clf__estimator__max_depth': [4, 6, 8, 10, 12, 16, 20, None]
, 'clf__estimator__class_weight':['balanced','balanced_subsample']
, 'clf__estimator__n_estimators': [10, 25, 50, 100]
, 'clf__estimator__criterion': ['gini', 'entropy']#, 'log_loss']
#, 'clf__estimator__max_features': ['auto', 'sqrt']
, 'clf__estimator__min_samples_leaf': [1, 2, 3, 4, 5, 10]
, 'clf__estimator__min_samples_split': [2, 5, 15, 20]
}
]
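# Note: oob_score = True is only valid with bootstrap = True (set above); the
# out-of-bag estimate gives an extra, CV-free check of generalisation.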
# Create pipeline
pipeline = Pipeline([
('pre', MinMaxScaler()),
('clf', ClfSwitcher()),
])
# Grid search, i.e. hyperparameter tuning, refitting on MCC
gscv_rf = GridSearchCV(pipeline
, parameters
#, scoring = 'f1', refit = 'f1'
, scoring = mcc_score_fn, refit = 'mcc'
, cv = skf_cv
, **njobs
, return_train_score = False
, verbose = 3)
# Fit
gscv_rf_fit = gscv_rf.fit(X, y)
gscv_rf_fit_be_mod = gscv_rf_fit.best_params_
gscv_rf_fit_be_res = gscv_rf_fit.cv_results_
print('Best model:\n', gscv_rf_fit_be_mod)
print("Best model's score:\n", gscv_rf_fit.best_score_, ':', round(gscv_rf_fit.best_score_, 2))
#print('\nMean test score from fit results:', round(mean(gscv_rf_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_rf_fit_be_res['mean_test_mcc']),2))
######################################
# Blind test
######################################
# See how it does on the BLIND test
test_predict = gscv_rf_fit.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
y_btsf = np.array(y_bts)
print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))
# create a dict with all scores
rf_bts_dict = {#'best_model': list(gscv_rf_fit_be_mod.items())
'bts_fscore' : None
, 'bts_mcc' : None
, 'bts_precision': None
, 'bts_recall' : None
, 'bts_accuracy' : None
, 'bts_roc_auc' : None
, 'bts_jaccard' : None }
rf_bts_dict
rf_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2)
rf_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2)
rf_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
rf_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2)
rf_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2)
rf_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2)
rf_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
rf_bts_dict
# Create a df from dict with all scores
pd.DataFrame.from_dict(rf_bts_dict, orient = 'index', columns = ['best_model']) # quick preview; columns must be a list
rf_bts_df = pd.DataFrame.from_dict(rf_bts_dict, orient = 'index')
rf_bts_df.columns = ['Random_Forest']
print(rf_bts_df)
# Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_rf_fit_be_mod.items() )])
model_params_df = model_params.to_frame()
model_params_df
model_params_df.columns = ['Random_Forest']
model_params_df.columns
# Combine the df of scores and the best model params
rf_bts_df.columns
rf_output = pd.concat([model_params_df, rf_bts_df], axis = 0)
rf_output
# Format the combined df
# Drop the best_model_params row from rf_output
rf_df = rf_output.drop([0], axis = 0)
rf_df
#FIXME: tidy the index of the formatted df
###############################################################################

UQ_pnca_ML.py (new file, 287 lines)

@@ -0,0 +1,287 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 6 13:41:54 2022
@author: tanu
"""
import os, sys
import pandas as pd
import numpy as np
import pprint as pp
from copy import deepcopy
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
import itertools
#import seaborn as sns
import matplotlib.pyplot as plt
print(np.__version__)
print(pd.__version__)
from statistics import mean, stdev, median, mode
#from imblearn.over_sampling import RandomOverSampler
#from imblearn.over_sampling import SMOTE
#from imblearn.pipeline import Pipeline
#from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
#from imblearn.combine import SMOTEENN
#from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
scoring_fn = ({'accuracy' : make_scorer(accuracy_score)
, 'fscore' : make_scorer(f1_score)
, 'mcc' : make_scorer(matthews_corrcoef)
, 'precision' : make_scorer(precision_score)
, 'recall' : make_scorer(recall_score)
, 'roc_auc' : make_scorer(roc_auc_score)
, 'jcc' : make_scorer(jaccard_score)
})
rs = {'random_state': 42}
njobs = {'n_jobs': 10}
skf_cv = StratifiedKFold(n_splits = 10
#, shuffle = False, random_state= None)
, shuffle = True,**rs)
rskf_cv = RepeatedStratifiedKFold(n_splits = 10
                                  , n_repeats = 3
                                  , **rs)
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
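# A minimal sketch (kept commented out, since X and y are only built further
# down) of how scoring_fn and skf_cv plug straight into cross_validate;
# LogisticRegression is just an illustrative choice here, not the pipeline itself.
# cv_scores = cross_validate(LogisticRegression(**rs), X, y,
#                            scoring = scoring_fn, cv = skf_cv, **njobs)
# pd.DataFrame(cv_scores).mean()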
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/")
# my function
#from MultClassPipe import MultClassPipeline
from MultClassPipe2 import MultClassPipeline2
from loopity_loop import MultClassPipeSKFLoop
from MultClassPipe3 import MultClassPipeSKFCV
gene = 'pncA'
drug = 'pyrazinamide'
#==============
# directories
#==============
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
#=======
# input
#=======
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
my_df = pd.read_csv(infile_ml1, index_col = 0)
my_df.dtypes
my_df_cols = my_df.columns
geneL_basic = ['pnca']
# -- CHECK script -- imports.py
#%% get cols
mycols = my_df.columns
mycols
# separate column dtypes: numeric vs categorical
num_type = ['int64', 'float64']
cat_type = ['object', 'bool']
# TODO:
# Treat active site aa pos as category and not numerical: This needs to be part of merged_df3!
#if my_df['active_aa_pos'].dtype in num_type:
# my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
# my_df['active_aa_pos'].dtype
# -- CHECK script -- imports.py
#%%============================================================================
#%% IMPUTE values for OR
#%% Combine mmCSM_lig Data
#%% Combine PROVEAN data
#%% Combine ED logo data
#%% Masking columns (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
# get logic from upstream!
my_df_ml = my_df.copy()
my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()
my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0
(my_df_ml['ligand_affinity_change'] == 0).sum()
#%%============================================================================
# Separate blind test set
my_df_ml[drug].isna().sum()
blind_test_df = my_df_ml[my_df_ml[drug].isna()]
blind_test_df.shape
training_df = my_df_ml[my_df_ml[drug].notna()]
training_df.shape
# Target1: dst
training_df[drug].value_counts()
training_df['dst_mode'].value_counts()
#%% Build X
common_cols_stabilityN = ['ligand_distance'
, 'ligand_affinity_change'
, 'duet_stability_change'
, 'ddg_foldx'
, 'deepddg'
, 'ddg_dynamut2']
foldX_cols = ['contacts'
#, 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
#, 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
#, 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
#, 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
#, 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
#, 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'
]
X_strFN = ['rsa'
#, 'asa'
, 'kd_values'
, 'rd_values']
X_evolFN = ['consurf_score'
, 'snap2_score']
# quick inspection which lineage to use:
#foo = my_df_ml[['lineage', 'lineage_count_all', 'lineage_count_unique']]
X_genomicFN = ['maf'
# , 'or_mychisq'
# , 'or_logistic'
# , 'or_fisher'
# , 'pval_fisher'
#, 'lineage'
, 'lineage_count_all'
, 'lineage_count_unique'
]
#%% Construct numerical and categorical column names
# numerical feature names
numerical_FN = common_cols_stabilityN + foldX_cols + X_strFN + X_evolFN + X_genomicFN
#categorical feature names
categorical_FN = ['ss_class'
, 'wt_prop_water'
# , 'lineage_labels' # misleading if using merged_df3
, 'mut_prop_water'
, 'wt_prop_polarity'
, 'mut_prop_polarity'
, 'wt_calcprop'
, 'mut_calcprop'
#, 'active_aa_pos'
]
#%% extracting dfs based on numerical, categorical column names
#----------------------------------
# WITHOUT the target var included
#----------------------------------
num_df = training_df[numerical_FN]
num_df.shape
cat_df = training_df[categorical_FN]
cat_df.shape
all_df = training_df[numerical_FN + categorical_FN]
all_df.shape
#------------------------------
# WITH the target var included:
#'wtgt': with target
#------------------------------
# drug and dst_mode should be the same thing
num_df_wtgt = training_df[numerical_FN + ['dst_mode']]
num_df_wtgt.shape
cat_df_wtgt = training_df[categorical_FN + ['dst_mode']]
cat_df_wtgt.shape
all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']]
all_df_wtgt.shape
#%%================================================================
#%% Apply ML
#TODO: Apply oversampling! (a commented sketch follows the data setup below)
#%% Data
#X = all_df_wtgt[numerical_FN+categorical_FN]
X = all_df_wtgt[numerical_FN]
y = all_df_wtgt['dst_mode']
#Blind test data {same format}
X_bts = blind_test_df[numerical_FN]
y_bts = blind_test_df['dst_mode']
X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']]
# Quick check
(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
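# A minimal sketch for the oversampling TODO above, kept commented out; it
# assumes the imbalanced-learn package (see the commented imports at the top):
# from imblearn.over_sampling import RandomOverSampler
# ros = RandomOverSampler(**rs)
# X, y = ros.fit_resample(X, y)
# y.value_counts() # classes should now be balanced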
#%% MultClassPipeSKFCV: function call()
mm_skf_scoresD = MultClassPipeSKFCV(input_df = X
, target = y
, var_type = 'numerical'
, skf_cv = skf_cv)
mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
mm_skf_scores_df_all
mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0)
mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # compare train vs test scores to check for overfitting
print(mm_skf_scores_df_train)
print(mm_skf_scores_df_test)