Fixed the masking condition for per-gene ML training data and wrote out the revised mask files

2022-07-27 13:36:16 +01:00 · 2022-07-27 13:36:16 +01:00 · f4cab1fdfb
commit f4cab1fdfb
parent 0adf69f75a
3 changed files with 46 additions and 26 deletions
--- a/scripts/ml/ml_functions/GetMLData.py
+++ b/scripts/ml/ml_functions/GetMLData.py
@ -101,14 +101,15 @@ def getmldata(gene, drug
    datadir = homedir + '/git/Data/'
    indir   = datadir + drug + '/input/'
    outdir  = datadir + drug + '/output/'
-    outdir_ml = outdir + 'ml/'
+    #outdir_ml = outdir + 'ml/'
+    outdir_ml = homedir + '/git/LSHTM_ML/output/'

    #==========================
    # outfile for ML training:
    #==========================
-    outFile_ml = outdir_ml + gene.lower() + '_training_data.csv'
+    outFile_ml = outdir_ml +  gene.lower() + '_training_data.csv'
   
-    outFile_mask_ml = outdir_ml + gene.lower() + '_mask_check.csv'
+    outFile_mask_ml = outdir_ml + 'genes/mask_check/' + gene.lower() + '_mask_check.csv'
    
    #=======
    # input
@ -436,41 +437,58 @@ def getmldata(gene, drug
        #X_stabilityN = common_cols_stabiltyN
        gene_affinity_colnames = []# not needed as its the common ones 
        cols_to_mask = ['ligand_affinity_change']
+        cols_to_mask_ppi2 = []

    if gene.lower() in geneL_ppi2:
        gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] 
        #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
+        #cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
+        cols_to_mask = ['ligand_affinity_change']
+        cols_to_mask_ppi2 = ['mcsm_ppi2_affinity']
+
    
    if gene.lower() in geneL_na:
        gene_affinity_colnames =  ['mcsm_na_affinity'] 
        #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
+        cols_to_mask_ppi2 = []
+
    
    if gene.lower() in geneL_na_ppi2:
        gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
        #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
-        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
+        #cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
+        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
+        cols_to_mask_ppi2 = ['mcsm_ppi2_affinity']
    
    #=======================
    # Masking columns:        
-    # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
+    # ligand_distance > 10 ==> mCSM-lig AND mCSM-NA column values are set to 0
+    # interface_dist > 10  ==> mCSM-ppi2 column values are set to 0
    #=======================
    my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
    my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
    
-    # mask the mcsm affinity related columns where ligand distance > 10
+    # mask the mcsm ligand affinity AND mcsm_na affinity columns where ligand distance > 10
    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
-    (my_df_ml['ligand_affinity_change'] == 0).sum()

-    mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]  
+    # mask the mcsm_ppi2_affinity column where interface_dist > 10
+    if len(cols_to_mask_ppi2) > 0:
+        my_df_ml.loc[(my_df_ml['interface_dist'] > 10), cols_to_mask_ppi2] = 0
+        add_cols_mask = ['interface_dist'] + cols_to_mask_ppi2
+        mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask]
+    else:
+        mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask ]
+    
+    # sanity check: see script SANITY_CHECK_mask.py
+    
+    if write_maskfile:
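+        # write mask file for sanity check (NOTE(review): the path built below does not use outFile_mask_ml, which points to 'genes/mask_check/' -- confirm the intended location)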
+        #mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
+
+        mask_check.to_csv(outdir_ml + gene.lower() + '_mask_check.csv')

-    #===================================================
-    # write file for check
-    #mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
-    #mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
-    #===================================================
    ###############################################################################
    #%% Feature groups (FG): Build X for Input ML 
    ############################################################################
--- a/scripts/ml/ml_functions/MultClfs.py
+++ b/scripts/ml/ml_functions/MultClfs.py
@ -77,6 +77,7 @@ import re
 import itertools
 from sklearn.model_selection import LeaveOneGroupOut
 from sklearn.decomposition import PCA
+from sklearn.naive_bayes import ComplementNB

 #%% GLOBALS
 #rs = {'random_state': 42}
@ -260,6 +261,8 @@ def MultModelsCl(input_df, target
    #======================================================
    models = [('AdaBoost Classifier'         , AdaBoostClassifier(**rs) )
              , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
+              #, ('Bernoulli NB'               , BernoulliNB() ) # pks Naive Bayes, CAUTION
+              , ('Complement NB'             , ComplementNB() )
              , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
              , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
              , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
@ -271,8 +274,8 @@ def MultModelsCl(input_df, target
              , ('Logistic Regression'       , LogisticRegression(**rs) )
              , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
              , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
-              , ('Multinomial'               , MultinomialNB() )
-              , ('Naive Bayes'               , BernoulliNB() )
+              , ('Multinomial NB'               , MultinomialNB() )
+
              , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
              , ('QDA'                       , QuadraticDiscriminantAnalysis() )
              , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
--- a/scripts/ml/ml_functions/test_func_singlegene.py
+++ b/scripts/ml/ml_functions/test_func_singlegene.py
@ -18,7 +18,6 @@ from SplitTTS import *
 from MultClfs_SIMPLE import *

 #%%
-
 skf_cv = StratifiedKFold(n_splits = 10
                            , shuffle = True,**rs)
 #sel_cv = logo
@ -29,16 +28,16 @@ skf_cv = StratifiedKFold(n_splits = 10
 gene_model_paramD = {'data_combined_model'       : False
                    , 'use_or'                   : False
                    , 'omit_all_genomic_features': False
-                    , 'write_maskfile'           : False
+                    , 'write_maskfile'           : True
                    , 'write_outfile'            : False }

 #df = getmldata(gene, drug, **gene_model_paramD)
 df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
-df = getmldata('embB', 'ethambutol'   , **gene_model_paramD)
-df = getmldata('katG', 'isoniazid'    , **gene_model_paramD)
-df = getmldata('rpoB', 'rifampicin'   , **gene_model_paramD)
-df  = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
-#df  = getmldata('alr' , 'cycloserine'  , **combined_model_paramD)
+#df = getmldata('embB', 'ethambutol'   , **gene_model_paramD)
+#df = getmldata('katG', 'isoniazid'    , **gene_model_paramD)
+#df = getmldata('rpoB', 'rifampicin'   , **gene_model_paramD)
+#df  = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
+#df  = getmldata('alr' , 'cycloserine'  , **gene_model_paramD)

 all(df.columns.isin(['gene_name'])) # should be False
 spl_type = '70_30'