diff --git a/scripts/ml/ml_functions/GetMLData.py b/scripts/ml/ml_functions/GetMLData.py
index 3410389..bdbd70e 100755
--- a/scripts/ml/ml_functions/GetMLData.py
+++ b/scripts/ml/ml_functions/GetMLData.py
@@ -101,14 +101,15 @@ def getmldata(gene, drug
     datadir = homedir + '/git/Data/'
     indir   = datadir + drug + '/input/'
     outdir  = datadir + drug + '/output/'
-    outdir_ml = outdir + 'ml/'
-      
+    #outdir_ml = outdir + 'ml/'
+    outdir_ml = homedir + '/git/LSHTM_ML/output/'
+
     #==========================
     # outfile for ML training:
     #==========================
-    outFile_ml = outdir_ml + gene.lower() + '_training_data.csv'
+    outFile_ml = outdir_ml +  gene.lower() + '_training_data.csv'
    
-    outFile_mask_ml = outdir_ml + gene.lower() + '_mask_check.csv'
+    outFile_mask_ml = outdir_ml + 'genes/mask_check/' + gene.lower() + '_mask_check.csv'
     
     #=======
     # input
@@ -436,41 +437,58 @@ def getmldata(gene, drug
         #X_stabilityN = common_cols_stabiltyN
         gene_affinity_colnames = []# not needed as its the common ones 
         cols_to_mask = ['ligand_affinity_change']
-        
+        cols_to_mask_ppi2 = []
+
     if gene.lower() in geneL_ppi2:
         gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] 
         #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
+        #cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
+        cols_to_mask = ['ligand_affinity_change']
+        cols_to_mask_ppi2 = ['mcsm_ppi2_affinity']
+
     
     if gene.lower() in geneL_na:
         gene_affinity_colnames =  ['mcsm_na_affinity'] 
         #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
         cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
+        cols_to_mask_ppi2 = []
+
     
     if gene.lower() in geneL_na_ppi2:
         gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
         #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
+        #cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
+        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
+        cols_to_mask_ppi2 = ['mcsm_ppi2_affinity']
     
     #=======================
-    # Masking columns:
-    # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
+    # Masking columns:
+    # ligand_distance > 10 ==> mCSM-lig AND mCSM-NA column values are set to 0
+    # interface_dist > 10  ==> mCSM-ppi2 column values are set to 0
     #=======================
     my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
     my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
     my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
     
-    # mask the mcsm affinity related columns where ligand distance > 10
+    # mask the mcsm ligand affinity AND mcsm_na affinity columns where ligand distance > 10
     my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
-    (my_df_ml['ligand_affinity_change'] == 0).sum()
+
+    # mask the mcsm_ppi2_affinity column where interface_dist > 10
+    if len(cols_to_mask_ppi2) > 0:
+        my_df_ml.loc[(my_df_ml['interface_dist'] > 10), cols_to_mask_ppi2] = 0
+        add_cols_mask = ['interface_dist'] + cols_to_mask_ppi2
+        mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask]
+    else:
+        mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask ]
     
-    mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]  
+    # sanity check: see script SANITY_CHECK_mask.py
     
-    #===================================================
-    # write file for check
-    #mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
-    #mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
-    #===================================================
+    if write_maskfile:
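+        # write mask file for sanity check; NOTE(review): this writes directly under outdir_ml, not to outFile_mask_ml (the 'genes/mask_check/' path defined earlier in getmldata) -- confirm which location is intended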
+        #mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
+
+        mask_check.to_csv(outdir_ml + gene.lower() + '_mask_check.csv')
+
     ###############################################################################
     #%% Feature groups (FG): Build X for Input ML 
     ############################################################################
diff --git a/scripts/ml/ml_functions/MultClfs.py b/scripts/ml/ml_functions/MultClfs.py
index c4a5be1..dafe756 100755
--- a/scripts/ml/ml_functions/MultClfs.py
+++ b/scripts/ml/ml_functions/MultClfs.py
@@ -77,6 +77,7 @@ import re
 import itertools
 from sklearn.model_selection import LeaveOneGroupOut
 from sklearn.decomposition import PCA
+from sklearn.naive_bayes import ComplementNB
 
 #%% GLOBALS
 #rs = {'random_state': 42}
@@ -260,6 +261,8 @@ def MultModelsCl(input_df, target
     #======================================================
     models = [('AdaBoost Classifier'         , AdaBoostClassifier(**rs) )
               , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
+              #, ('Bernoulli NB'               , BernoulliNB() ) # pks Naive Bayes, CAUTION
+              , ('Complement NB'             , ComplementNB() )
               , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
               , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
               , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
@@ -271,8 +274,8 @@ def MultModelsCl(input_df, target
               , ('Logistic Regression'       , LogisticRegression(**rs) )
               , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
               , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
-              , ('Multinomial'               , MultinomialNB() )
-              , ('Naive Bayes'               , BernoulliNB() )
+              , ('Multinomial NB'               , MultinomialNB() )
+
               , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
               , ('QDA'                       , QuadraticDiscriminantAnalysis() )
               , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
diff --git a/scripts/ml/ml_functions/test_func_singlegene.py b/scripts/ml/ml_functions/test_func_singlegene.py
index a132b89..707b188 100644
--- a/scripts/ml/ml_functions/test_func_singlegene.py
+++ b/scripts/ml/ml_functions/test_func_singlegene.py
@@ -18,7 +18,6 @@ from SplitTTS import *
 from MultClfs_SIMPLE import *
 
 #%%
-
 skf_cv = StratifiedKFold(n_splits = 10
                             , shuffle = True,**rs)
 #sel_cv = logo
@@ -29,16 +28,16 @@ skf_cv = StratifiedKFold(n_splits = 10
 gene_model_paramD = {'data_combined_model'       : False
                     , 'use_or'                   : False
                     , 'omit_all_genomic_features': False
-                    , 'write_maskfile'           : False
+                    , 'write_maskfile'           : True
                     , 'write_outfile'            : False }
 
 #df = getmldata(gene, drug, **gene_model_paramD)
 df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
-df = getmldata('embB', 'ethambutol'   , **gene_model_paramD)
-df = getmldata('katG', 'isoniazid'    , **gene_model_paramD)
-df = getmldata('rpoB', 'rifampicin'   , **gene_model_paramD)
-df  = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
-#df  = getmldata('alr' , 'cycloserine'  , **combined_model_paramD)
+#df = getmldata('embB', 'ethambutol'   , **gene_model_paramD)
+#df = getmldata('katG', 'isoniazid'    , **gene_model_paramD)
+#df = getmldata('rpoB', 'rifampicin'   , **gene_model_paramD)
+#df  = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
+#df  = getmldata('alr' , 'cycloserine'  , **gene_model_paramD)
 
 all(df.columns.isin(['gene_name'])) # should be False
 spl_type = '70_30'