working on dissected model, testing diff feature groups

2022-06-20 21:51:07 +01:00 · 2022-06-20 21:51:07 +01:00 · e68a153883
commit e68a153883
parent 135efcee41
4 changed files with 270 additions and 161 deletions
--- a/scripts/ml/MultModelsCl.py
+++ b/scripts/ml/MultModelsCl.py
@ -74,11 +74,11 @@ import json
 rs = {'random_state': 42}
 njobs = {'n_jobs': 10}
-scoring_fn =  ({ 'mcc'        : make_scorer(matthews_corrcoef)
+scoring_fn =  ({ 'mcc'         : make_scorer(matthews_corrcoef)
                , 'accuracy'      : make_scorer(accuracy_score)
                , 'fscore'     : make_scorer(f1_score)
                , 'precision'  : make_scorer(precision_score)
                , 'recall'     : make_scorer(recall_score)
                , 'accuracy'   : make_scorer(accuracy_score)
                , 'roc_auc'    : make_scorer(roc_auc_score)
                , 'jcc'        : make_scorer(jaccard_score)
            }) 
@ -137,7 +137,9 @@ def MultModelsCl(input_df, target, skf_cv
    col_transform = ColumnTransformer(transformers = t
                                       , remainder='passthrough')
    #======================================================
    # Specify multiple Classification models  
    #======================================================
    models = [('Logistic Regression'       , LogisticRegression(**rs) )
            , ('Logistic RegressionCV'     , LogisticRegressionCV(**rs) )
            , ('Gaussian NB'               , GaussianNB() )
--- a/scripts/ml/MultModelsCl_dissected.py
+++ b/scripts/ml/MultModelsCl_dissected.py
@ -78,10 +78,10 @@ rs = {'random_state': 42}
 njobs = {'n_jobs': 10}
 scoring_fn =  ({ 'mcc'        : make_scorer(matthews_corrcoef)
                , 'accuracy'  : make_scorer(accuracy_score)
                , 'fscore'    : make_scorer(f1_score)
                , 'precision' : make_scorer(precision_score)
                , 'recall'    : make_scorer(recall_score)
                , 'accuracy'  : make_scorer(accuracy_score)
                , 'roc_auc'   : make_scorer(roc_auc_score)
                , 'jcc'       : make_scorer(jaccard_score)
            }) 
@ -103,7 +103,6 @@ def MultModelsCl_dissected(input_df, target, skf_cv
                       , blind_test_target
                       , add_cm = True # adds confusion matrix based on cross_val_predict
                       , add_yn = True  # adds target var class numbers
                       , feature_groups = ['']
                       , var_type = ['numerical', 'categorical','mixed']):
    '''
@ -123,13 +122,17 @@ def MultModelsCl_dissected(input_df, target, skf_cv
    Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training
    '''
    #======================================================
    # Determine categorical and numerical features
    #======================================================
    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
    numerical_ix
    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
    categorical_ix    
    #======================================================
    # Determine preprocessing steps ~ var_type
    #======================================================
    if var_type == 'numerical':
        t = [('num', MinMaxScaler(), numerical_ix)]
@ -143,7 +146,9 @@ def MultModelsCl_dissected(input_df, target, skf_cv
    col_transform = ColumnTransformer(transformers = t
                                       , remainder='passthrough')
-    # Specify multiple Classification models  
+    #======================================================
    # Specify multiple Classification Models  
    #======================================================
    models = [('Logistic Regression'       , LogisticRegression(**rs) )
            , ('Logistic RegressionCV'     , LogisticRegressionCV(**rs) )
            , ('Gaussian NB'               , GaussianNB() )
@ -206,7 +211,7 @@ def MultModelsCl_dissected(input_df, target, skf_cv
        #######################################################################
        #======================================================
-        # Option 1: Add confusion matrix from cross_val_predict
+        # Option: Add confusion matrix from cross_val_predict
        # Understand and USE with caution
        # cross_val_score, cross_val_predict, "Passing these predictions into an evaluation metric may not be a valid way to measure generalization performance. Results can differ from cross_validate and cross_val_score unless all tests sets have equal size and the metric decomposes over samples."
        # https://stackoverflow.com/questions/65645125/producing-a-confusion-matrix-with-cross-validate
@ -237,7 +242,7 @@ def MultModelsCl_dissected(input_df, target, skf_cv
            skf_cv_modD = skf_cv_modD
        #######################################################################            
        #=============================================
-        # Option 2: Add targety numbers for data
+        # Option: Add target (y) numbers for data
        #=============================================
        if add_yn:    
--- a/scripts/ml/ml_data_dissected.py
+++ b/scripts/ml/ml_data_dissected.py
@ -417,125 +417,37 @@ else:
 #---------------------------------------
 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-#%%########################################################################
+#%% Data for ML ###############################################################
 #==========================
 #     Data for ML
 #==========================
 my_df_ml = my_df.copy()
-#%% Build X: input for ML
+# Build column names to mask for affinity changes
 common_cols_stabiltyN = ['ligand_distance'
           , 'ligand_affinity_change'
           , 'duet_stability_change'
           , 'ddg_foldx'
           , 'deepddg'
           , 'ddg_dynamut2'
           , 'mmcsm_lig'
           , 'contacts']
 # Build stability columns ~ gene
 if gene.lower() in geneL_basic:
-    X_stabilityN = common_cols_stabiltyN
+    #X_stabilityN = common_cols_stabiltyN
    gene_affinity_colnames = []# not needed as its a common one
    cols_to_mask = ['ligand_affinity_change']
 if gene.lower() in geneL_ppi2:
-#    X_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity' , 'interface_dist'] 
+    gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] 
-    geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist'] 
+    #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
    X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
 if gene.lower() in geneL_na:
-#    X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] 
+    gene_affinity_colnames =  ['mcsm_na_affinity'] 
-    geneL_na_st_cols =  ['mcsm_na_affinity'] 
+    #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
    X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
 if gene.lower() in geneL_na_ppi2:
-#    X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
+    gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-    geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
+    #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
    X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
 X_foldX_cols = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
 , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
 , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
 , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
 , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
 , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'
 ]
 X_str =  ['rsa'
           #, 'asa'
           , 'kd_values'
           , 'rd_values']    
 X_ssFN = X_stabilityN + X_str + X_foldX_cols
 X_evolFN =  ['consurf_score'
           , 'snap2_score'
           , 'provean_score']
 X_genomic_mafor =  ['maf'
                , 'logorI'
                # , 'or_rawI'
                # , 'or_mychisq'
                # , 'or_logistic'
                # , 'or_fisher'
                # , 'pval_fisher'
                ]
 X_genomic_linegae  = ['lineage_proportion'
                      , 'dist_lineage_proportion'
                      #, 'lineage' # could be included as a category but it has L2;L4  formatting
                      , 'lineage_count_all'
                      , 'lineage_count_unique'
                      ]
 X_genomicFN = X_genomic_mafor + X_genomic_linegae
 #X_aaindexFN = list(aa_df_cols)
 #print('\nTotal no. of features for aaindex:', len(X_aaindexFN))
 # numerical feature names [NO aa_index]
 numerical_FN = X_ssFN  + X_evolFN + X_genomicFN
 # categorical feature names
 categorical_FN = ['ss_class'
            # , 'wt_prop_water'
            # , 'mut_prop_water'
            # , 'wt_prop_polarity'
            # , 'mut_prop_polarity'
            # , 'wt_calcprop'
            # , 'mut_calcprop'
            , 'aa_prop_change'
            , 'electrostatics_change'
            , 'polarity_change'
            , 'water_change'
            , 'drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2]
            , 'active_site' #[didn't use it for uq_v1]
            #, 'gene_name' # will be required for the combined stuff
             ]
 #----------------------------------------------
 # count numerical and categorical features
 #----------------------------------------------
 print('\nNo. of numerical features:', len(numerical_FN)
      , '\nNo. of categorical features:', len(categorical_FN))
 ###########################################################################
 #=======================
 # Masking columns:
 # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
 #=======================
 # my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()
 # my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
 # my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0
 # (my_df_ml['ligand_affinity_change'] == 0).sum()
 my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
 my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
 my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
@ -546,16 +458,139 @@ my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
 mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]  
 #===================================================
 # write file for check
 mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
 mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
 #===================================================
 ###############################################################################
 #%% Feature groups (FG): Build X for Input ML 
 ############################################################################
 #===========================
 # FG1: Evolutionary features
 #===========================
 X_evolFN =  ['consurf_score'
           , 'snap2_score'
           , 'provean_score']
 ###############################################################################
 #========================
 # FG2: Stability features
 #========================
 #--------
 # common
 #--------
 X_common_stability_Fnum = [
           'duet_stability_change'
           , 'ddg_foldx'
           , 'deepddg'
           , 'ddg_dynamut2'
           , 'mmcsm_lig'
           , 'contacts']
 #--------
 # FoldX
 #--------
 X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
 , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
 , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
 , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
 , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
 , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss']
 X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum
 ###############################################################################
 #===================
 # FG3: Affinity features
 #===================
 common_affinity_Fnum =  ['ligand_distance'
                , 'ligand_affinity_change']
 # if gene.lower() in geneL_basic:
 #     X_affinityFN = common_affinity_Fnum 
 # else:
 #     X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
 X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
 ###############################################################################
 #============================
 # FG4: Residue level features
 #============================
 #-----------
 # AA index
 #-----------
 X_aaindex_Fnum = list(aa_df_cols)
 print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum))
 #-----------------
 # surface area
 # depth
 # hydrophobicity
 #-----------------
 X_str_Fnum =  ['rsa'
           #, 'asa'
           , 'kd_values'
           , 'rd_values']   
 #---------------------------
 # Other aa properties
 # active site indication
 #---------------------------
 X_aap_Fcat = ['ss_class'
            # , 'wt_prop_water'
            # , 'mut_prop_water'
            # , 'wt_prop_polarity'
            # , 'mut_prop_polarity'
            # , 'wt_calcprop'
            # , 'mut_calcprop'
            , 'aa_prop_change'
            , 'electrostatics_change'
            , 'polarity_change'
            , 'water_change'
            , 'active_site']
 X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
 ###############################################################################
 #========================
 # FG5: Genomic features
 #========================
 X_gn_mafor_Fnum =  ['maf'
                , 'logorI'
                # , 'or_rawI'
                # , 'or_mychisq'
                # , 'or_logistic'
                # , 'or_fisher'
                # , 'pval_fisher'
                ]
 X_gn_linegae_Fnum  = ['lineage_proportion'
                      , 'dist_lineage_proportion'
                      #, 'lineage' # could be included as a category but it has L2;L4  formatting
                      , 'lineage_count_all'
                      , 'lineage_count_unique'
                      ]
 X_gn_Fcat = ['drtype_mode_labels'  # beware then you can't use it to predict [USED it for uq_v1, not v2]
               #, 'gene_name' # will be required for the combined stuff
             ]
 X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat
 ###############################################################################
 # Feature groups further collapsed:
 X_structural_FN =  X_stability_FN + X_affinityFN + X_resprop_FN
 all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
 ###############################################################################
 #%% Define training and test data
 #======================================================
 # Training and BLIND test set [UQ]: actual vs imputed
 # No aa index but active_site included
 # dst with actual values  : training set
 # dst with imputed values : blind test
-#==================================================
+#======================================================
 my_df_ml[drug].isna().sum()  #'na' ones are the blind_test set
 blind_test_df = my_df_ml[my_df_ml[drug].isna()]
@ -567,6 +602,7 @@ training_df.shape
 # Target 1: dst_mode
 training_df[drug].value_counts()
 training_df['dst_mode'].value_counts()
 ####################################################################
 #============
 # ML data
@ -574,8 +610,8 @@ training_df['dst_mode'].value_counts()
 #------
 # X: Training and Blind test (BTS)
 #------
-X     = training_df[numerical_FN + categorical_FN] 
+X     = training_df[all_featuresN] 
-X_bts = blind_test_df[numerical_FN + categorical_FN] 
+X_bts = blind_test_df[all_featuresN] 
 #------
 # y
@ -601,19 +637,67 @@ yc1_ratio = yc1[0]/yc1[1]
 yc2 = Counter(y_bts)
 yc2_ratio = yc2[0]/yc2[1]
 ###############################################################################
 #======================================================
 # Determine categorical and numerical features
 #======================================================
 numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
 numerical_cols 
 categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
 categorical_cols 
 ################################################################################
 # IMPORTANT sanity checks
 if len(X.columns) == len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN):
    print('\nPASS: ML data with input features, training and test generated...'
          , '\n\nTotal no. of input features:'        , len(X.columns)
          , '\n--------No. of numerical features:'    , len(numerical_cols)
          , '\n--------No. of categorical features:'  , len(categorical_cols)
          , '\n\nTotal no. of evolutionary features:' , len(X_evolFN)
          , '\n\nTotal no. of stability features:'    , len(X_stability_FN)
          , '\n--------Common stabilty cols:'         , len(X_common_stability_Fnum)
          , '\n--------Foldx cols:'                   , len(X_foldX_Fnum)
          , '\n\nTotal no. of affinity features:'     , len(X_affinityFN)
          , '\n--------Common affinity cols:'         , len(common_affinity_Fnum)
          , '\n--------Gene specific affinity cols:'     , len(gene_affinity_colnames)
          , '\n\nTotal no. of residue level features:', len(X_resprop_FN)
          , '\n--------AA index cols:'                , len(X_aaindex_Fnum)
          , '\n--------Residue Prop cols:'            , len(X_str_Fnum)
          , '\n--------AA change Prop cols:'          , len(X_aap_Fcat)
          , '\n\nTotal no. of genomic features:'      , len(X_genomicFN)
          , '\n--------MAF+OR cols:'                  , len(X_gn_mafor_Fnum)
          , '\n--------Lineage cols:'                 , len(X_gn_linegae_Fnum)
          , '\n--------Other cols:'                   , len(X_gn_Fcat)
          )
 else:
    print('\nFAIL: numbers mismatch'
          , '\nExpected:',len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN)
          , '\nGot:', len(X.columns))
    sys.exit()
 ###############################################################################
 print('\n-------------------------------------------------------------'
-      , '\nSuccessfully split data: UQ [no aa_index but active site included] training'
+      , '\nSuccessfully split data: ALL features'
      , '\nactual values: training set'
      , '\nimputed values: blind test set'
-      , '\nTrain data size:', X.shape
+      
-      , '\nTest data size:', X_bts.shape
+      , '\n\nTotal data size:', len(X) + len(X_bts)
      , '\n\nTrain data size:', X.shape
      , '\ny_train numbers:', yc1
-      , '\ny_train ratio:',yc1_ratio
+
-      , '\n'
+      , '\n\nTest data size:', X_bts.shape
      , '\ny_test_numbers:', yc2
      , '\n\ny_train ratio:',yc1_ratio
      , '\ny_test ratio:', yc2_ratio
      , '\n-------------------------------------------------------------'
      )
 ###########################################################################
 #%% 
 ###########################################################################
--- a/scripts/ml/pnca_config_dissected.py
+++ b/scripts/ml/pnca_config_dissected.py
@ -47,60 +47,78 @@ outdir_ml = outdir + 'ml/uq_v1/dissected'
 print('\nOutput directory:', outdir_ml)
 #%%###########################################################################
 print('\nSanity checks:'
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio
      , '\n\n#####################################################################\n')
 print('\n================================================================\n')
-print('Strucutral features (n):'
+          , '\n\nTotal no. of evolutionary features:' , len(X_evolFN)
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')
-# print('AAindex features (n):'
+          , '\n\nTotal no. of stability features:'    , len(X_stability_FN)
-#       , len(X_aaindexFN)
+          , '\n--------Common stabilty cols:'         , len(X_common_stability_Fnum)
-#       , '\nThese are:\n'
+          , '\n--------Foldx cols:'                   , len(X_foldX_Fnum)
 #       , X_aaindexFN
 #       , '\n================================================================\n')
-print('Evolutionary features (n):'
+          , '\n\nTotal no. of affinity features:'     , len(X_affinityFN)
-      , len(X_evolFN)
+          , '\n--------Common affinity cols:'         , len(common_affinity_Fnum)
-      , '\nThese are:\n'
+          , '\n--------Gene specific affinity cols:'  , len(gene_affinity_colnames)
      , X_evolFN
      , '\n================================================================\n')
-print('Genomic features (n):'
+          , '\n\nTotal no. of residue level features:', len(X_resprop_FN)
-      , len(X_genomicFN)
+          , '\n--------AA index cols:'                , len(X_aaindex_Fnum)
-      , '\nThese are:\n'
+          , '\n--------Residue Prop cols:'            , len(X_str_Fnum)
-      , X_genomic_mafor, '\n'
+          , '\n--------AA change Prop cols:'          , len(X_aap_Fcat)
      , X_genomic_linegae
      , '\n================================================================\n')
-print('Categorical features (n):'
+          , '\n\nTotal no. of genomic features:'      , len(X_genomicFN)
-      , len(categorical_FN)
+          , '\n--------MAF+OR cols:'                  , len(X_gn_mafor_Fnum)
-      , '\nThese are:\n'
+          , '\n--------Lineage cols:'                 , len(X_gn_linegae_Fnum)
-      , categorical_FN
+          , '\n--------Other cols:'                   , len(X_gn_Fcat)
      , '\n================================================================\n')
-#if ( len(X.columns) ==  len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
+X_structural_FN =  X_stability_FN + X_affinityFN + X_resprop_FN
-if ( len(X.columns) ==  len(X_ssFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
+ X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
 all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
 ###############################################################################
 print('\n================================================================'
      , '\nTotal Evolutionary features (n):' , len(X_evolFN)
      , '\n--------------Evol. feature colnames:', X_evolFN
      , '\n================================================================'
      , '\n\nTotal structural features (n):', len(X_structural_FN)
      , '\n--------Stability ncols:'                      , len(X_stability_FN)
      , '\n--------------Common stability colnames:'      , X_common_stability_Fnum
      , '\n--------------Foldx colnames:'                 , X_foldX_Fnum
      , '\n--------Affinity ncols:'                       , len(X_affinityFN)
      , '\n--------------Common affinity colnames:'       , common_affinity_Fnum
      , '\n--------------Gene specific affinity colnames:', gene_affinity_colnames
      , '\n--------Residue prop ncols:'                   , len(X_resprop_FN)
      , '\n--------------Residue Prop cols:'              , X_str_Fnum
      , '\n--------------AA change Prop cols:'            , X_aap_Fcat
      , '\n--------------AA index cols:'                  , X_aaindex_Fnum
      , '\n================================================================'
      , '\n\nTotal Genomic features (n):'   , len(X_genomicFN)
      , '\n--------MAF+OR cols:'                         , len(X_gn_mafor_Fnum)
      , '\n--------------MAF+OR colnames:'               , X_gn_mafor_Fnum
      , '\n--------Lineage cols:'                        , len(X_gn_linegae_Fnum)
      , '\n--------------Lineage cols:'                  , X_gn_linegae_Fnum
      , '\n--------Other cols:'                          , len(X_gn_Fcat)
      , '\n--------------Other cols:'                    , X_gn_Fcat
      , '\n================================================================')
 # Sanity check
 if ( len(X.columns) ==  len(X_evolFN) + len(X_structural_FN) + len(X_genomicFN)):
    print('\nPass: No. of features match')
 else:
-    sys.exit('\nFail: Count of feature mismatch')
+    print('\nFail: Count of feature mismatch'
          , '\nExpected:', len(X_evolFN) + len(X_structural_FN) + len(X_genomicFN)
          , '\nGot:', len(X.columns))
    sys.exit()
 print('\n#####################################################################\n')
@ -108,7 +126,7 @@ print('\n#####################################################################\n
 # #==================
 # # Baseline models 
 # #==================
-# mm_skf_scoresD = MultModelsCl(input_df = X
+# mm_skf_scoresD = MultModelsCl_dissected(input_df = X
 #                                         , target = y
 #                                         , var_type = 'mixed'
 #                                         , skf_cv = skf_cv