added FS to MultClfs.py and modified data for different splits for consistency

2022-06-24 20:35:53 +01:00 · 2022-06-24 20:35:53 +01:00 · e2bc384155
commit e2bc384155
parent edb7aebd6a
12 changed files with 1585 additions and 994 deletions
--- a/scripts/ml/ml_data_cd_sl.py
+++ b/scripts/ml/ml_data_cd_sl.py
@@ -34,7 +34,11 @@ def setvars(gene,drug):
    from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
    
    from sklearn.pipeline import Pipeline, make_pipeline
+    import argparse
+    import re
    #%% GLOBALS
+    tts_split = "sl"
+
    rs = {'random_state': 42}
    njobs = {'n_jobs': 10}
    
@@ -56,12 +60,10 @@ def setvars(gene,drug):
                                      , **rs)
    
    mcc_score_fn  = {'mcc': make_scorer(matthews_corrcoef)}
-    jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
-    
+    jacc_score_fn = {'jcc': make_scorer(jaccard_score)}   
    #%% FOR LATER: Combine ED logo data
    ###########################################################################
-    rs = {'random_state': 42}
-    njobs = {'n_jobs': 10}
+
    homedir = os.path.expanduser("~")
    
    geneL_basic     = ['pnca']
@@ -422,118 +424,31 @@ def setvars(gene,drug):
    #==========================
    my_df_ml = my_df.copy()
    
-    #%% Build X: input for ML
-    common_cols_stabiltyN = ['ligand_distance'
-               , 'ligand_affinity_change'
-               , 'duet_stability_change'
-               , 'ddg_foldx'
-               , 'deepddg'
-               , 'ddg_dynamut2'
-               , 'mmcsm_lig'
-               , 'contacts']
-    
-    # Build stability columns ~ gene
+    # Build column names to mask for affinity changes
    if gene.lower() in geneL_basic:
-        X_stabilityN = common_cols_stabiltyN
+        #X_stabilityN = common_cols_stabiltyN
+        gene_affinity_colnames = []# not needed as its the common ones 
        cols_to_mask = ['ligand_affinity_change']
        
    if gene.lower() in geneL_ppi2:
-    #    X_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity' , 'interface_dist'] 
-        geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist'] 
-        X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
+        gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] 
+        #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
        cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
    
    if gene.lower() in geneL_na:
-    #    X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] 
-        geneL_na_st_cols =  ['mcsm_na_affinity'] 
-        X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
+        gene_affinity_colnames =  ['mcsm_na_affinity'] 
+        #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
    
    if gene.lower() in geneL_na_ppi2:
-    #    X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-        geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-        X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
+        gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
+        #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
    
-    
-    X_foldX_cols = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
-    , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
-    , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
-    , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
-    , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
-    , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'
-    ]
-    
-    X_str =  ['rsa'
-               #, 'asa'
-               , 'kd_values'
-               , 'rd_values']    
-    
-    X_ssFN = X_stabilityN + X_str + X_foldX_cols
-    
-    X_evolFN =  ['consurf_score'
-               , 'snap2_score'
-               , 'provean_score']
-        
-    X_genomic_mafor =  ['maf'
-                    , 'logorI'
-                    # , 'or_rawI'
-                    # , 'or_mychisq'
-                    # , 'or_logistic'
-                    # , 'or_fisher'
-                    # , 'pval_fisher'
-                    ]
-    
-    X_genomic_linegae  = ['lineage_proportion'
-                          , 'dist_lineage_proportion'
-                          #, 'lineage' # could be included as a category but it has L2;L4  formatting
-                          , 'lineage_count_all'
-                          , 'lineage_count_unique'
-                          ]
-    
-    X_genomicFN = X_genomic_mafor + X_genomic_linegae
-    
-    X_aaindexFN = list(aa_df_cols)
-    
-    print('\nTotal no. of features for aaindex:', len(X_aaindexFN))
-    
-    # numerical feature names
-    numerical_FN = X_ssFN  + X_evolFN + X_genomicFN + X_aaindexFN
-      
-    # categorical feature names
-    categorical_FN = ['ss_class'
-                # , 'wt_prop_water'
-                # , 'mut_prop_water'
-                # , 'wt_prop_polarity'
-                # , 'mut_prop_polarity'
-                # , 'wt_calcprop'
-                # , 'mut_calcprop'
-                , 'aa_prop_change'
-                , 'electrostatics_change'
-                , 'polarity_change'
-                , 'water_change'
-                , 'drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2]
-                , 'active_site' #[didn't use it for uq_v1]
-                #, 'gene_name' # will be required for the combined stuff
-                 ]
-                 
-    #----------------------------------------------
-    # count numerical and categorical features
-    #----------------------------------------------
-    
-    print('\nNo. of numerical features:', len(numerical_FN)
-          , '\nNo. of categorical features:', len(categorical_FN))
-    
    #=======================
    # Masking columns:
    # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
    #=======================
-    # my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()
-    # my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
-    
-    # my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0
-    # (my_df_ml['ligand_affinity_change'] == 0).sum()
-    
    my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
    my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
@@ -544,24 +459,154 @@ def setvars(gene,drug):
    
    mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]  
    
+    #===================================================
    # write file for check
    mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
-    mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv') 
-       
-    #####################################################################
-    #================================================================
-    # Training and Blind test [COMPLETE data]: scaling law split
-    # https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d
+    mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
+    #===================================================
+    ###############################################################################
+    #%% Feature groups (FG): Build X for Input ML 
+    ############################################################################
+    #===========================
+    # FG1: Evolutionary features
+    #===========================
+    X_evolFN =  ['consurf_score'
+               , 'snap2_score'
+               , 'provean_score']
    
+    ###############################################################################
+    #========================
+    # FG2: Stability features
+    #========================
+    #--------
+    # common
+    #--------
+    X_common_stability_Fnum = [
+               'duet_stability_change'
+               , 'ddg_foldx'
+               , 'deepddg'
+               , 'ddg_dynamut2'
+               , 'contacts']
+    #--------
+    # FoldX
+    #--------
+    X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
+    , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
+    , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
+    , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
+    , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
+    , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss']
+    
+    X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum
+    
+    ###############################################################################
+    #===================
+    # FG3: Affinity features
+    #===================
+    common_affinity_Fnum =  ['ligand_distance'
+                    , 'ligand_affinity_change'
+                    , 'mmcsm_lig']
+    
+    # if gene.lower() in geneL_basic:
+    #     X_affinityFN = common_affinity_Fnum 
+    # else:
+    #     X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
+        
+    X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
+    
+    ###############################################################################
+    #============================
+    # FG4: Residue level features
+    #============================
+    #-----------
+    # AA index
+    #-----------
+    X_aaindex_Fnum = list(aa_df_cols)
+    print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum))
+    
+    #-----------------
+    # surface area
+    # depth
+    # hydrophobicity
+    #-----------------
+    X_str_Fnum =  ['rsa'
+               #, 'asa'
+               , 'kd_values'
+               , 'rd_values']   
+    
+    #---------------------------
+    # Other aa properties
+    # active site indication
+    #---------------------------
+    X_aap_Fcat = ['ss_class'
+                # , 'wt_prop_water'
+                # , 'mut_prop_water'
+                # , 'wt_prop_polarity'
+                # , 'mut_prop_polarity'
+                # , 'wt_calcprop'
+                # , 'mut_calcprop'
+                , 'aa_prop_change'
+                , 'electrostatics_change'
+                , 'polarity_change'
+                , 'water_change'
+                , 'active_site']
+       
+    X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
+    ###############################################################################
+    #========================
+    # FG5: Genomic features
+    #========================
+    X_gn_mafor_Fnum =  ['maf'
+                    #, 'logorI'
+                    # , 'or_rawI'
+                    # , 'or_mychisq'
+                    # , 'or_logistic'
+                    # , 'or_fisher'
+                    # , 'pval_fisher'
+                    ]
+    
+    X_gn_linegae_Fnum  = ['lineage_proportion'
+                          , 'dist_lineage_proportion'
+                          #, 'lineage' # could be included as a category but it has L2;L4  formatting
+                          , 'lineage_count_all'
+                          , 'lineage_count_unique'
+                          ]
+    
+    # X_gn_Fcat = ['drtype_mode_labels'  # beware then you can't use it to predict [USED it for uq_v1, not v2]
+    #                #, 'gene_name' # will be required for the combined stuff
+    #              ]
+    X_gn_Fcat = []
+    
+    X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat
+    ###############################################################################
+    #========================
+    # FG6 collapsed: Structural : Stability + Affinity + ResidueProp
+    #========================
+    X_structural_FN =  X_stability_FN + X_affinityFN + X_resprop_FN
+    
+    ###############################################################################
+    #========================
+    # BUILDING all features
+    #========================
+    all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
+    
+    ###############################################################################
+    #%% Define training and test data
+    #================================================================
+    # Training and BLIND test set: scaling law split
+    #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d
+    # dst with actual values  : training set
+    # dst with imputed values : THROW AWAY [unrepresentative]
    # test data size ~ 1/sqrt(features NOT including target variable)
    #================================================================
    my_df_ml[drug].isna().sum()
+    
    #    blind_test_df = my_df_ml[my_df_ml[drug].isna()]
    #    blind_test_df.shape
    
-    #training_df =  my_df_ml[my_df_ml[drug].notna()]
+    #training_df = my_df_ml[my_df_ml[drug].notna()]
    #training_df.shape
-    
+
    training_df = my_df_ml.copy()
    
    # Target 1: dst_mode
@@ -569,80 +614,14 @@ def setvars(gene,drug):
    training_df['dst_mode'].value_counts()
    
    ####################################################################
-    
-###############################################################################
-###############################################################################
-    # #%% extracting dfs based on numerical, categorical column names
-    # #----------------------------------
-    # # WITHOUT the target var included
-    # #----------------------------------
-    # num_df = training_df[numerical_FN]
-    # num_df.shape
-    
-    # cat_df = training_df[categorical_FN]
-    # cat_df.shape
-    
-    # all_df = training_df[numerical_FN + categorical_FN]
-    # all_df.shape
-    
-    # #------------------------------
-    # # WITH the target var included:
-    #     #'wtgt': with target
-    # #------------------------------
-    # # drug and dst_mode should be the same thing
-    # num_df_wtgt = training_df[numerical_FN + ['dst_mode']]
-    # num_df_wtgt.shape
-    
-    # cat_df_wtgt = training_df[categorical_FN + ['dst_mode']]
-    # cat_df_wtgt.shape
-    
-    # all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']]
-    # all_df_wtgt.shape
-    
-    #%%########################################################################
-    # #============
-    # # ML data: OLD
-    # #============
-    # #------
-    # # X: Training and Blind test (BTS)
-    # #------
-    # X     = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL
-    # X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL
-    # #X = all_df_wtgt[numerical_FN] # training numerical only
-    # #X_bts = blind_test_df[numerical_FN] # blind test data numerical
-    
-    # #------
-    # # y
-    # #------
-    # y = all_df_wtgt['dst_mode'] # training data y
-    # y_bts = blind_test_df['dst_mode'] # blind data test y
-    
-    # #X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']] 
-    
-    # # Quick check
-    # #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
-    # for i in range(len(cols_to_mask)):
-    #     ind = i+1
-    #     print('\nindex:', i, '\nind:', ind)
-    #     print('\nMask count check:'
-    #           , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
-    #           )
-    
-    # print('Original Data\n', Counter(y)
-    #       , 'Data dim:', X.shape)
-    
-###############################################################################
-###############################################################################    
    #====================================
-    # ML data: Train test split [COMPLETE data]: scaling law
+    # ML data: Train test split: SL
    # with stratification
    # 1-blind test : training_data for CV
    # 1/sqrt(columns) : blind test 
-    #=====================================
-      
-    # features: all_df or
-    x_features = training_df[numerical_FN + categorical_FN]
-    y_target = training_df['dst_mode']
+    #===========================================
+    x_features = training_df[all_featuresN]
+    y_target   = training_df['dst_mode']
    
    # sanity check
    if not 'dst_mode' in x_features.columns:
@@ -650,12 +629,15 @@ def setvars(gene,drug):
        x_ncols = len(x_features.columns)
        print('\nNo. of columns for x_features:', x_ncols)
        # NEED It for scaling law split
+        #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d
    else:
        sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')
-    
+    #-------------------
+    # train-test split
+    #-------------------
    sl_test_size = 1/np.sqrt(x_ncols)
    train = 1 - sl_test_size
-    
+
    #x_train, x_test, y_train, y_test # traditional var_names
    # so my downstream code doesn't need to change    
    X, X_bts, y, y_bts = train_test_split(x_features, y_target
@@ -667,16 +649,65 @@ def setvars(gene,drug):
    
    yc2 = Counter(y_bts)
    yc2_ratio = yc2[0]/yc2[1]
-
+    
+    ###############################################################################
+    #======================================================
+    # Determine categorical and numerical features
+    #======================================================
+    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
+    numerical_cols 
+    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
+    categorical_cols 
+    
+    ################################################################################
+    # IMPORTANT sanity checks
+    if len(X.columns) == len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN):
+        print('\nPASS: ML data with input features, training and test generated...'
+              , '\n\nTotal no. of input features:'        , len(X.columns)
+              , '\n--------No. of numerical features:'    , len(numerical_cols)
+              , '\n--------No. of categorical features:'  , len(categorical_cols)
+              
+              , '\n\nTotal no. of evolutionary features:' , len(X_evolFN)
+              
+              , '\n\nTotal no. of stability features:'    , len(X_stability_FN)
+              , '\n--------Common stabilty cols:'         , len(X_common_stability_Fnum)
+              , '\n--------Foldx cols:'                   , len(X_foldX_Fnum)
+              
+              , '\n\nTotal no. of affinity features:'     , len(X_affinityFN)
+              , '\n--------Common affinity cols:'         , len(common_affinity_Fnum)
+              , '\n--------Gene specific affinity cols:'  , len(gene_affinity_colnames)
+              
+              , '\n\nTotal no. of residue level features:', len(X_resprop_FN)
+              , '\n--------AA index cols:'                , len(X_aaindex_Fnum)
+              , '\n--------Residue Prop cols:'            , len(X_str_Fnum)
+              , '\n--------AA change Prop cols:'          , len(X_aap_Fcat)
+              
+              , '\n\nTotal no. of genomic features:'      , len(X_genomicFN)
+              , '\n--------MAF+OR cols:'                  , len(X_gn_mafor_Fnum)
+              , '\n--------Lineage cols:'                 , len(X_gn_linegae_Fnum)
+              , '\n--------Other cols:'                   , len(X_gn_Fcat)
+              )
+    else:
+        print('\nFAIL: numbers mismatch'
+              , '\nExpected:',len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN)
+              , '\nGot:', len(X.columns))
+        sys.exit()
+    ###############################################################################
    print('\n-------------------------------------------------------------'
-          , '\nSuccessfully split data with stratification according to scaling law [COMPLETE data]: 1/sqrt(x_ncols)'
-          , '\nInput features data size:', x_features.shape
-          , '\nTrain data size:', X.shape
-          , '\nTest data size:', X_bts.shape
+          , '\nSuccessfully split data: ALL features'
+          , '\nactual values: training set'
+          ,  '\nSplit:', tts_split
+          #, '\nimputed values: blind test set'
+          
+          , '\n\nTotal data size:', len(X) + len(X_bts)
+    
+          , '\n\nTrain data size:', X.shape
          , '\ny_train numbers:', yc1
-          , '\ny_train ratio:',yc1_ratio
-          , '\n'
+    
+          , '\n\nTest data size:', X_bts.shape
          , '\ny_test_numbers:', yc2
+    
+          , '\n\ny_train ratio:',yc1_ratio
          , '\ny_test ratio:', yc2_ratio
          , '\n-------------------------------------------------------------'
          )
@@ -775,3 +806,8 @@ def setvars(gene,drug):
    
    ###############################################################################
    # TODO: Find over and undersampling JUST for categorical data
+        ###########################################################################
+    
+    print('\n#################################################################'
+          , '\nDim of X for gene:', gene.lower(), '\n',  X.shape
+          , '\n###############################################################')