From e68a153883637eb2f17d92670fd09bb4ed850975 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 20 Jun 2022 21:51:07 +0100
Subject: [PATCH] working on dissected model, testing diff feature groups

---
 scripts/ml/MultModelsCl.py           |   6 +-
 scripts/ml/MultModelsCl_dissected.py |  17 +-
 scripts/ml/ml_data_dissected.py      | 298 +++++++++++++++++----------
 scripts/ml/pnca_config_dissected.py  | 110 +++++-----
 4 files changed, 270 insertions(+), 161 deletions(-)

diff --git a/scripts/ml/MultModelsCl.py b/scripts/ml/MultModelsCl.py
index 6ed37cd..74e2482 100755
--- a/scripts/ml/MultModelsCl.py
+++ b/scripts/ml/MultModelsCl.py
@@ -74,11 +74,11 @@ import json
 rs = {'random_state': 42}
 njobs = {'n_jobs': 10}
 
-scoring_fn =  ({ 'mcc'        : make_scorer(matthews_corrcoef)
-                , 'accuracy'      : make_scorer(accuracy_score)
+scoring_fn =  ({ 'mcc'         : make_scorer(matthews_corrcoef)
                 , 'fscore'     : make_scorer(f1_score)
                 , 'precision'  : make_scorer(precision_score)
                 , 'recall'     : make_scorer(recall_score)
+                , 'accuracy'   : make_scorer(accuracy_score)
                 , 'roc_auc'    : make_scorer(roc_auc_score)
                 , 'jcc'        : make_scorer(jaccard_score)
             }) 
@@ -137,7 +137,9 @@ def MultModelsCl(input_df, target, skf_cv
     col_transform = ColumnTransformer(transformers = t
                                        , remainder='passthrough')
     
+    #======================================================
     # Specify multiple Classification models  
+    #======================================================
     models = [('Logistic Regression'       , LogisticRegression(**rs) )
             , ('Logistic RegressionCV'     , LogisticRegressionCV(**rs) )
             , ('Gaussian NB'               , GaussianNB() )
diff --git a/scripts/ml/MultModelsCl_dissected.py b/scripts/ml/MultModelsCl_dissected.py
index cabef15..6919061 100644
--- a/scripts/ml/MultModelsCl_dissected.py
+++ b/scripts/ml/MultModelsCl_dissected.py
@@ -78,10 +78,10 @@ rs = {'random_state': 42}
 njobs = {'n_jobs': 10}
 
 scoring_fn =  ({ 'mcc'        : make_scorer(matthews_corrcoef)
-                , 'accuracy'  : make_scorer(accuracy_score)
                 , 'fscore'    : make_scorer(f1_score)
                 , 'precision' : make_scorer(precision_score)
                 , 'recall'    : make_scorer(recall_score)
+                , 'accuracy'  : make_scorer(accuracy_score)
                 , 'roc_auc'   : make_scorer(roc_auc_score)
                 , 'jcc'       : make_scorer(jaccard_score)
             }) 
@@ -103,7 +103,6 @@ def MultModelsCl_dissected(input_df, target, skf_cv
                        , blind_test_target
                        , add_cm = True # adds confusion matrix based on cross_val_predict
                        , add_yn = True  # adds target var class numbers
-                       , feature_groups = ['']
                        , var_type = ['numerical', 'categorical','mixed']):
 
     '''
@@ -122,14 +121,18 @@ def MultModelsCl_dissected(input_df, target, skf_cv
     returns
     Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training
     '''
-    
+
+    #======================================================
     # Determine categorical and numerical features
+    #======================================================
     numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
     numerical_ix
     categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
     categorical_ix    
 
+    #======================================================
     # Determine preprocessing steps ~ var_type
+    #======================================================
     if var_type == 'numerical':
         t = [('num', MinMaxScaler(), numerical_ix)]
 
@@ -143,7 +146,9 @@ def MultModelsCl_dissected(input_df, target, skf_cv
     col_transform = ColumnTransformer(transformers = t
                                        , remainder='passthrough')
     
-    # Specify multiple Classification models  
+    #======================================================
+    # Specify multiple Classification Models  
+    #======================================================
     models = [('Logistic Regression'       , LogisticRegression(**rs) )
             , ('Logistic RegressionCV'     , LogisticRegressionCV(**rs) )
             , ('Gaussian NB'               , GaussianNB() )
@@ -206,7 +211,7 @@ def MultModelsCl_dissected(input_df, target, skf_cv
         
         #######################################################################
         #======================================================
-        # Option 1: Add confusion matrix from cross_val_predict
+        # Option: Add confusion matrix from cross_val_predict
         # Understand and USE with caution
         # cross_val_score, cross_val_predict, "Passing these predictions into an evaluation metric may not be a valid way to measure generalization performance. Results can differ from cross_validate and cross_val_score unless all tests sets have equal size and the metric decomposes over samples."
         # https://stackoverflow.com/questions/65645125/producing-a-confusion-matrix-with-cross-validate
@@ -237,7 +242,7 @@ def MultModelsCl_dissected(input_df, target, skf_cv
             skf_cv_modD = skf_cv_modD
         #######################################################################            
         #=============================================
-        # Option 2: Add targety numbers for data
+        # Option: Add target (y) numbers for data
         #=============================================
         if add_yn:    
             
diff --git a/scripts/ml/ml_data_dissected.py b/scripts/ml/ml_data_dissected.py
index 4bd588c..12ea9b1 100644
--- a/scripts/ml/ml_data_dissected.py
+++ b/scripts/ml/ml_data_dissected.py
@@ -417,125 +417,37 @@ else:
 #---------------------------------------
 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 
-#%%########################################################################
+#%% Data for ML ###############################################################
 #==========================
 #     Data for ML
 #==========================
 my_df_ml = my_df.copy()
 
-#%% Build X: input for ML
-common_cols_stabiltyN = ['ligand_distance'
-           , 'ligand_affinity_change'
-           , 'duet_stability_change'
-           , 'ddg_foldx'
-           , 'deepddg'
-           , 'ddg_dynamut2'
-           , 'mmcsm_lig'
-           , 'contacts']
-
-# Build stability columns ~ gene
+# Build column names to mask for affinity changes
 if gene.lower() in geneL_basic:
-    X_stabilityN = common_cols_stabiltyN
+    #X_stabilityN = common_cols_stabiltyN
+    gene_affinity_colnames = []# not needed as its a common one
     cols_to_mask = ['ligand_affinity_change']
     
 if gene.lower() in geneL_ppi2:
-#    X_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity' , 'interface_dist'] 
-    geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist'] 
-    X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
+    gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] 
+    #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
     cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
 
 if gene.lower() in geneL_na:
-#    X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] 
-    geneL_na_st_cols =  ['mcsm_na_affinity'] 
-    X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
+    gene_affinity_colnames =  ['mcsm_na_affinity'] 
+    #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
     cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
 
 if gene.lower() in geneL_na_ppi2:
-#    X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-    geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-    X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
+    gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
+    #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
     cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
 
-
-X_foldX_cols = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
-, 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
-, 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
-, 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
-, 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
-, 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'
-]
-
-X_str =  ['rsa'
-           #, 'asa'
-           , 'kd_values'
-           , 'rd_values']    
-
-X_ssFN = X_stabilityN + X_str + X_foldX_cols
-
-X_evolFN =  ['consurf_score'
-           , 'snap2_score'
-           , 'provean_score']
-    
-X_genomic_mafor =  ['maf'
-                , 'logorI'
-                # , 'or_rawI'
-                # , 'or_mychisq'
-                # , 'or_logistic'
-                # , 'or_fisher'
-                # , 'pval_fisher'
-                ]
-
-X_genomic_linegae  = ['lineage_proportion'
-                      , 'dist_lineage_proportion'
-                      #, 'lineage' # could be included as a category but it has L2;L4  formatting
-                      , 'lineage_count_all'
-                      , 'lineage_count_unique'
-                      ]
-
-X_genomicFN = X_genomic_mafor + X_genomic_linegae
-
-#X_aaindexFN = list(aa_df_cols)
-
-#print('\nTotal no. of features for aaindex:', len(X_aaindexFN))
-
-# numerical feature names [NO aa_index]
-numerical_FN = X_ssFN  + X_evolFN + X_genomicFN
-
-
-# categorical feature names
-categorical_FN = ['ss_class'
-            # , 'wt_prop_water'
-            # , 'mut_prop_water'
-            # , 'wt_prop_polarity'
-            # , 'mut_prop_polarity'
-            # , 'wt_calcprop'
-            # , 'mut_calcprop'
-            , 'aa_prop_change'
-            , 'electrostatics_change'
-            , 'polarity_change'
-            , 'water_change'
-            , 'drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2]
-            , 'active_site' #[didn't use it for uq_v1]
-            #, 'gene_name' # will be required for the combined stuff
-             ]
-#----------------------------------------------
-# count numerical and categorical features
-#----------------------------------------------
-
-print('\nNo. of numerical features:', len(numerical_FN)
-      , '\nNo. of categorical features:', len(categorical_FN))
-
-###########################################################################
 #=======================
 # Masking columns:
 # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
 #=======================
-# my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()
-# my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
-
-# my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0
-# (my_df_ml['ligand_affinity_change'] == 0).sum()
-
 my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
 my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
 my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
@@ -546,16 +458,139 @@ my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
 
 mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]  
 
+#===================================================
 # write file for check
 mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
 mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
-
 #===================================================
+###############################################################################
+#%% Feature groups (FG): Build X for Input ML 
+############################################################################
+#===========================
+# FG1: Evolutionary features
+#===========================
+X_evolFN =  ['consurf_score'
+           , 'snap2_score'
+           , 'provean_score']
+
+###############################################################################
+#========================
+# FG2: Stability features
+#========================
+#--------
+# common
+#--------
+X_common_stability_Fnum = [
+           'duet_stability_change'
+           , 'ddg_foldx'
+           , 'deepddg'
+           , 'ddg_dynamut2'
+           , 'mmcsm_lig'
+           , 'contacts']
+#--------
+# FoldX
+#--------
+X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
+, 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
+, 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
+, 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
+, 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
+, 'volumetric_rr', 'volumetric_mm', 'volumetric_ss']
+
+X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum
+
+###############################################################################
+#===================
+# FG3: Affinity features
+#===================
+common_affinity_Fnum =  ['ligand_distance'
+                , 'ligand_affinity_change']
+
+# if gene.lower() in geneL_basic:
+#     X_affinityFN = common_affinity_Fnum 
+# else:
+#     X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
+    
+X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
+
+###############################################################################
+#============================
+# FG4: Residue level features
+#============================
+#-----------
+# AA index
+#-----------
+X_aaindex_Fnum = list(aa_df_cols)
+print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum))
+
+#-----------------
+# surface area
+# depth
+# hydrophobicity
+#-----------------
+X_str_Fnum =  ['rsa'
+           #, 'asa'
+           , 'kd_values'
+           , 'rd_values']   
+
+#---------------------------
+# Other aa properties
+# active site indication
+#---------------------------
+X_aap_Fcat = ['ss_class'
+            # , 'wt_prop_water'
+            # , 'mut_prop_water'
+            # , 'wt_prop_polarity'
+            # , 'mut_prop_polarity'
+            # , 'wt_calcprop'
+            # , 'mut_calcprop'
+            , 'aa_prop_change'
+            , 'electrostatics_change'
+            , 'polarity_change'
+            , 'water_change'
+            , 'active_site']
+
+
+X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
+###############################################################################
+#========================
+# FG5: Genomic features
+#========================
+X_gn_mafor_Fnum =  ['maf'
+                , 'logorI'
+                # , 'or_rawI'
+                # , 'or_mychisq'
+                # , 'or_logistic'
+                # , 'or_fisher'
+                # , 'pval_fisher'
+                ]
+
+X_gn_linegae_Fnum  = ['lineage_proportion'
+                      , 'dist_lineage_proportion'
+                      #, 'lineage' # could be included as a category but it has L2;L4  formatting
+                      , 'lineage_count_all'
+                      , 'lineage_count_unique'
+                      ]
+
+X_gn_Fcat = ['drtype_mode_labels'  # beware then you can't use it to predict [USED it for uq_v1, not v2]
+               #, 'gene_name' # will be required for the combined stuff
+             ]
+
+X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat
+###############################################################################
+# Feature groups further collapsed:
+X_structural_FN =  X_stability_FN + X_affinityFN + X_resprop_FN
+
+all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
+
+###############################################################################
+#%% Define training and test data
+#======================================================
 # Training and BLIND test set [UQ]: actual vs imputed
 # No aa index but active_site included
 # dst with actual values  : training set
 # dst with imputed values : blind test
-#==================================================
+#======================================================
 my_df_ml[drug].isna().sum()  #'na' ones are the blind_test set
 
 blind_test_df = my_df_ml[my_df_ml[drug].isna()]
@@ -567,6 +602,7 @@ training_df.shape
 # Target 1: dst_mode
 training_df[drug].value_counts()
 training_df['dst_mode'].value_counts()
+
 ####################################################################
 #============
 # ML data
@@ -574,8 +610,8 @@ training_df['dst_mode'].value_counts()
 #------
 # X: Training and Blind test (BTS)
 #------
-X     = training_df[numerical_FN + categorical_FN] 
-X_bts = blind_test_df[numerical_FN + categorical_FN] 
+X     = training_df[all_featuresN] 
+X_bts = blind_test_df[all_featuresN] 
 
 #------
 # y
@@ -601,19 +637,67 @@ yc1_ratio = yc1[0]/yc1[1]
 yc2 = Counter(y_bts)
 yc2_ratio = yc2[0]/yc2[1]
 
+###############################################################################
+#======================================================
+# Determine categorical and numerical features
+#======================================================
+numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
+numerical_cols 
+categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
+categorical_cols 
+
+################################################################################
+# IMPORTANT sanity checks
+if len(X.columns) == len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN):
+    print('\nPASS: ML data with input features, training and test generated...'
+          , '\n\nTotal no. of input features:'        , len(X.columns)
+          , '\n--------No. of numerical features:'    , len(numerical_cols)
+          , '\n--------No. of categorical features:'  , len(categorical_cols)
+          
+          , '\n\nTotal no. of evolutionary features:' , len(X_evolFN)
+          
+          , '\n\nTotal no. of stability features:'    , len(X_stability_FN)
+          , '\n--------Common stabilty cols:'         , len(X_common_stability_Fnum)
+          , '\n--------Foldx cols:'                   , len(X_foldX_Fnum)
+          
+          , '\n\nTotal no. of affinity features:'     , len(X_affinityFN)
+          , '\n--------Common affinity cols:'         , len(common_affinity_Fnum)
+          , '\n--------Gene specific affinity cols:'     , len(gene_affinity_colnames)
+          
+          , '\n\nTotal no. of residue level features:', len(X_resprop_FN)
+          , '\n--------AA index cols:'                , len(X_aaindex_Fnum)
+          , '\n--------Residue Prop cols:'            , len(X_str_Fnum)
+          , '\n--------AA change Prop cols:'          , len(X_aap_Fcat)
+          
+          , '\n\nTotal no. of genomic features:'      , len(X_genomicFN)
+          , '\n--------MAF+OR cols:'                  , len(X_gn_mafor_Fnum)
+          , '\n--------Lineage cols:'                 , len(X_gn_linegae_Fnum)
+          , '\n--------Other cols:'                   , len(X_gn_Fcat)
+          )
+else:
+    print('\nFAIL: numbers mismatch'
+          , '\nExpected:',len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN)
+          , '\nGot:', len(X.columns))
+    sys.exit(1)  # non-zero status: a bare sys.exit() would report success (0) for a failed check
+###############################################################################
 print('\n-------------------------------------------------------------'
-      , '\nSuccessfully split data: UQ [no aa_index but active site included] training'
+      , '\nSuccessfully split data: ALL features'
       , '\nactual values: training set'
       , '\nimputed values: blind test set'
-      , '\nTrain data size:', X.shape
-      , '\nTest data size:', X_bts.shape
+      
+      , '\n\nTotal data size:', len(X) + len(X_bts)
+
+      , '\n\nTrain data size:', X.shape
       , '\ny_train numbers:', yc1
-      , '\ny_train ratio:',yc1_ratio
-      , '\n'
+
+      , '\n\nTest data size:', X_bts.shape
       , '\ny_test_numbers:', yc2
+
+      , '\n\ny_train ratio:',yc1_ratio
       , '\ny_test ratio:', yc2_ratio
       , '\n-------------------------------------------------------------'
       )
+
 ###########################################################################
 #%% 
 ###########################################################################
diff --git a/scripts/ml/pnca_config_dissected.py b/scripts/ml/pnca_config_dissected.py
index a4b3873..24367d3 100644
--- a/scripts/ml/pnca_config_dissected.py
+++ b/scripts/ml/pnca_config_dissected.py
@@ -47,60 +47,78 @@ outdir_ml = outdir + 'ml/uq_v1/dissected'
 print('\nOutput directory:', outdir_ml)
 
 #%%###########################################################################
-print('\nSanity checks:'
-      , '\nTotal input features:', len(X.columns)
-      , '\n'
-      , '\nTraining data size:', X.shape
-      , '\nTest data size:', X_bts.shape
-      , '\n'
-      , '\nTarget feature numbers (training data):', Counter(y)
-      , '\nTarget features ratio (training data:', yc1_ratio
-      , '\n'
-      , '\nTarget feature numbers (test data):', Counter(y_bts)
-      , '\nTarget features ratio (test data):', yc2_ratio
-      
-      , '\n\n#####################################################################\n')
-
 print('\n================================================================\n')
 
-print('Strucutral features (n):'
-      , len(X_ssFN)
-      , '\nThese are:'
-      , '\nCommon stablity features:', X_stabilityN
-      , '\nFoldX columns:', X_foldX_cols
-      , '\nOther struc columns:', X_str
-      , '\n================================================================\n')
+print('\n\nTotal no. of evolutionary features:' , len(X_evolFN)
+          
+          , '\n\nTotal no. of stability features:'    , len(X_stability_FN)
+          , '\n--------Common stabilty cols:'         , len(X_common_stability_Fnum)
+          , '\n--------Foldx cols:'                   , len(X_foldX_Fnum)
+          
+          , '\n\nTotal no. of affinity features:'     , len(X_affinityFN)
+          , '\n--------Common affinity cols:'         , len(common_affinity_Fnum)
+          , '\n--------Gene specific affinity cols:'  , len(gene_affinity_colnames)
+          
+          , '\n\nTotal no. of residue level features:', len(X_resprop_FN)
+          , '\n--------AA index cols:'                , len(X_aaindex_Fnum)
+          , '\n--------Residue Prop cols:'            , len(X_str_Fnum)
+          , '\n--------AA change Prop cols:'          , len(X_aap_Fcat)
+          
+          , '\n\nTotal no. of genomic features:'      , len(X_genomicFN)
+          , '\n--------MAF+OR cols:'                  , len(X_gn_mafor_Fnum)
+          , '\n--------Lineage cols:'                 , len(X_gn_linegae_Fnum)
+          , '\n--------Other cols:'                   , len(X_gn_Fcat))
 
-# print('AAindex features (n):'
-#       , len(X_aaindexFN)
-#       , '\nThese are:\n'
-#       , X_aaindexFN
-#       , '\n================================================================\n')
+X_structural_FN =  X_stability_FN + X_affinityFN + X_resprop_FN
+# X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
+all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
+     
+###############################################################################
 
-print('Evolutionary features (n):'
-      , len(X_evolFN)
-      , '\nThese are:\n'
-      , X_evolFN
-      , '\n================================================================\n')
+print('\n================================================================'
+      
+      , '\nTotal Evolutionary features (n):' , len(X_evolFN)
+      , '\n--------------Evol. feature colnames:', X_evolFN
+      
+      , '\n================================================================'
+      
+      , '\n\nTotal structural features (n):', len(X_structural_FN)
+      
+      , '\n--------Stability ncols:'                      , len(X_stability_FN)
+      , '\n--------------Common stability colnames:'      , X_common_stability_Fnum
+      , '\n--------------Foldx colnames:'                 , X_foldX_Fnum
+     
+      , '\n--------Affinity ncols:'                       , len(X_affinityFN)
+      , '\n--------------Common affinity colnames:'       , common_affinity_Fnum
+      , '\n--------------Gene specific affinity colnames:', gene_affinity_colnames
 
-print('Genomic features (n):'
-      , len(X_genomicFN)
-      , '\nThese are:\n'
-      , X_genomic_mafor, '\n'
-      , X_genomic_linegae
-      , '\n================================================================\n')
+      , '\n--------Residue prop ncols:'                   , len(X_resprop_FN)
+      , '\n--------------Residue Prop cols:'              , X_str_Fnum
+      , '\n--------------AA change Prop cols:'            , X_aap_Fcat
+      , '\n--------------AA index cols:'                  , X_aaindex_Fnum
+      
+      , '\n================================================================'
+      
+      , '\n\nTotal Genomic features (n):'   , len(X_genomicFN)
+      , '\n--------MAF+OR cols:'                         , len(X_gn_mafor_Fnum)
+      , '\n--------------MAF+OR colnames:'               , X_gn_mafor_Fnum
 
-print('Categorical features (n):'
-      , len(categorical_FN)
-      , '\nThese are:\n'
-      , categorical_FN
-      , '\n================================================================\n')
+      , '\n--------Lineage cols:'                        , len(X_gn_linegae_Fnum)
+      , '\n--------------Lineage cols:'                  , X_gn_linegae_Fnum
 
-#if ( len(X.columns) ==  len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
-if ( len(X.columns) ==  len(X_ssFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
+      , '\n--------Other cols:'                          , len(X_gn_Fcat)
+      , '\n--------------Other cols:'                    , X_gn_Fcat
+      
+      , '\n================================================================')
+
+# Sanity check
+if ( len(X.columns) ==  len(X_evolFN) + len(X_structural_FN) + len(X_genomicFN)):
     print('\nPass: No. of features match')
 else:
-    sys.exit('\nFail: Count of feature mismatch')
+    print('\nFail: Count of feature mismatch'
+          , '\nExpected:', len(X_evolFN) + len(X_structural_FN) + len(X_genomicFN)
+          , '\nGot:', len(X.columns))
+    sys.exit(1)  # keep the non-zero exit status that the replaced sys.exit('<message>') call produced
 
 print('\n#####################################################################\n')
 
@@ -108,7 +126,7 @@ print('\n#####################################################################\n
 # #==================
 # # Baseline models 
 # #==================
-# mm_skf_scoresD = MultModelsCl(input_df = X
+# mm_skf_scoresD = MultModelsCl_dissected(input_df = X
 #                                         , target = y
 #                                         , var_type = 'mixed'
 #                                         , skf_cv = skf_cv