working on dissected model, testing diff feature groups

2022-06-20 21:51:07 +01:00 · 2022-06-20 21:51:07 +01:00 · e68a153883
commit e68a153883
parent 135efcee41
4 changed files with 270 additions and 161 deletions
--- a/scripts/ml/ml_data_dissected.py
+++ b/scripts/ml/ml_data_dissected.py
@ -417,125 +417,37 @@ else:
 #---------------------------------------
 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

-#%%########################################################################
+#%% Data for ML ###############################################################
 #==========================
 #     Data for ML
 #==========================
 my_df_ml = my_df.copy()

-#%% Build X: input for ML
-common_cols_stabiltyN = ['ligand_distance'
-           , 'ligand_affinity_change'
-           , 'duet_stability_change'
-           , 'ddg_foldx'
-           , 'deepddg'
-           , 'ddg_dynamut2'
-           , 'mmcsm_lig'
-           , 'contacts']
-
-# Build stability columns ~ gene
+# Build column names to mask for affinity changes
 if gene.lower() in geneL_basic:
-    X_stabilityN = common_cols_stabiltyN
+    #X_stabilityN = common_cols_stabiltyN
+    gene_affinity_colnames = []# not needed as it's a common one
    cols_to_mask = ['ligand_affinity_change']
    
 if gene.lower() in geneL_ppi2:
-#    X_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity' , 'interface_dist'] 
-    geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist'] 
-    X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
+    gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] 
+    #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']

 if gene.lower() in geneL_na:
-#    X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] 
-    geneL_na_st_cols =  ['mcsm_na_affinity'] 
-    X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
+    gene_affinity_colnames =  ['mcsm_na_affinity'] 
+    #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']

 if gene.lower() in geneL_na_ppi2:
-#    X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-    geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-    X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
+    gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
+    #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']

-
-X_foldX_cols = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
-, 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
-, 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
-, 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
-, 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
-, 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'
-]
-
-X_str =  ['rsa'
-           #, 'asa'
-           , 'kd_values'
-           , 'rd_values']    
-
-X_ssFN = X_stabilityN + X_str + X_foldX_cols
-
-X_evolFN =  ['consurf_score'
-           , 'snap2_score'
-           , 'provean_score']
-    
-X_genomic_mafor =  ['maf'
-                , 'logorI'
-                # , 'or_rawI'
-                # , 'or_mychisq'
-                # , 'or_logistic'
-                # , 'or_fisher'
-                # , 'pval_fisher'
-                ]
-
-X_genomic_linegae  = ['lineage_proportion'
-                      , 'dist_lineage_proportion'
-                      #, 'lineage' # could be included as a category but it has L2;L4  formatting
-                      , 'lineage_count_all'
-                      , 'lineage_count_unique'
-                      ]
-
-X_genomicFN = X_genomic_mafor + X_genomic_linegae
-
-#X_aaindexFN = list(aa_df_cols)
-
-#print('\nTotal no. of features for aaindex:', len(X_aaindexFN))
-
-# numerical feature names [NO aa_index]
-numerical_FN = X_ssFN  + X_evolFN + X_genomicFN
-
-
-# categorical feature names
-categorical_FN = ['ss_class'
-            # , 'wt_prop_water'
-            # , 'mut_prop_water'
-            # , 'wt_prop_polarity'
-            # , 'mut_prop_polarity'
-            # , 'wt_calcprop'
-            # , 'mut_calcprop'
-            , 'aa_prop_change'
-            , 'electrostatics_change'
-            , 'polarity_change'
-            , 'water_change'
-            , 'drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2]
-            , 'active_site' #[didn't use it for uq_v1]
-            #, 'gene_name' # will be required for the combined stuff
-             ]
-#----------------------------------------------
-# count numerical and categorical features
-#----------------------------------------------
-
-print('\nNo. of numerical features:', len(numerical_FN)
-      , '\nNo. of categorical features:', len(categorical_FN))
-
-###########################################################################
 #=======================
 # Masking columns:
 # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
 #=======================
-# my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()
-# my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
-
-# my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0
-# (my_df_ml['ligand_affinity_change'] == 0).sum()
-
 my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
 my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
 my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
@ -546,16 +458,139 @@ my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0

 mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]  

+#===================================================
 # write file for check
 mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
 mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
-
 #===================================================
+###############################################################################
+#%% Feature groups (FG): Build X for Input ML 
+############################################################################
+#===========================
+# FG1: Evolutionary features
+#===========================
+X_evolFN =  ['consurf_score'
+           , 'snap2_score'
+           , 'provean_score']
+
+###############################################################################
+#========================
+# FG2: Stability features
+#========================
+#--------
+# common
+#--------
+X_common_stability_Fnum = [
+           'duet_stability_change'
+           , 'ddg_foldx'
+           , 'deepddg'
+           , 'ddg_dynamut2'
+           , 'mmcsm_lig'
+           , 'contacts']
+#--------
+# FoldX
+#--------
+X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
+, 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
+, 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
+, 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
+, 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
+, 'volumetric_rr', 'volumetric_mm', 'volumetric_ss']
+
+X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum
+
+###############################################################################
+#===================
+# FG3: Affinity features
+#===================
+common_affinity_Fnum =  ['ligand_distance'
+                , 'ligand_affinity_change']
+
+# if gene.lower() in geneL_basic:
+#     X_affinityFN = common_affinity_Fnum 
+# else:
+#     X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
+    
+X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
+
+###############################################################################
+#============================
+# FG4: Residue level features
+#============================
+#-----------
+# AA index
+#-----------
+X_aaindex_Fnum = list(aa_df_cols)
+print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum))
+
+#-----------------
+# surface area
+# depth
+# hydrophobicity
+#-----------------
+X_str_Fnum =  ['rsa'
+           #, 'asa'
+           , 'kd_values'
+           , 'rd_values']   
+
+#---------------------------
+# Other aa properties
+# active site indication
+#---------------------------
+X_aap_Fcat = ['ss_class'
+            # , 'wt_prop_water'
+            # , 'mut_prop_water'
+            # , 'wt_prop_polarity'
+            # , 'mut_prop_polarity'
+            # , 'wt_calcprop'
+            # , 'mut_calcprop'
+            , 'aa_prop_change'
+            , 'electrostatics_change'
+            , 'polarity_change'
+            , 'water_change'
+            , 'active_site']
+
+
+X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
+###############################################################################
+#========================
+# FG5: Genomic features
+#========================
+X_gn_mafor_Fnum =  ['maf'
+                , 'logorI'
+                # , 'or_rawI'
+                # , 'or_mychisq'
+                # , 'or_logistic'
+                # , 'or_fisher'
+                # , 'pval_fisher'
+                ]
+
+X_gn_linegae_Fnum  = ['lineage_proportion'
+                      , 'dist_lineage_proportion'
+                      #, 'lineage' # could be included as a category but it has L2;L4  formatting
+                      , 'lineage_count_all'
+                      , 'lineage_count_unique'
+                      ]
+
+X_gn_Fcat = ['drtype_mode_labels'  # beware then you can't use it to predict [USED it for uq_v1, not v2]
+               #, 'gene_name' # will be required for the combined stuff
+             ]
+
+X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat
+###############################################################################
+# Feature groups further collapsed:
+X_structural_FN =  X_stability_FN + X_affinityFN + X_resprop_FN
+
+all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
+
+###############################################################################
+#%% Define training and test data
+#======================================================
 # Training and BLIND test set [UQ]: actual vs imputed
 # No aa index but active_site included
 # dst with actual values  : training set
 # dst with imputed values : blind test
-#==================================================
+#======================================================
 my_df_ml[drug].isna().sum()  #'na' ones are the blind_test set

 blind_test_df = my_df_ml[my_df_ml[drug].isna()]
@ -567,6 +602,7 @@ training_df.shape
 # Target 1: dst_mode
 training_df[drug].value_counts()
 training_df['dst_mode'].value_counts()
+
 ####################################################################
 #============
 # ML data
@ -574,8 +610,8 @@ training_df['dst_mode'].value_counts()
 #------
 # X: Training and Blind test (BTS)
 #------
-X     = training_df[numerical_FN + categorical_FN] 
-X_bts = blind_test_df[numerical_FN + categorical_FN] 
+X     = training_df[all_featuresN] 
+X_bts = blind_test_df[all_featuresN] 

 #------
 # y
@ -601,19 +637,67 @@ yc1_ratio = yc1[0]/yc1[1]
 yc2 = Counter(y_bts)
 yc2_ratio = yc2[0]/yc2[1]

+###############################################################################
+#======================================================
+# Determine categorical and numerical features
+#======================================================
+numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
+numerical_cols 
+categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
+categorical_cols 
+
+################################################################################
+# IMPORTANT sanity checks
+if len(X.columns) == len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN):
+    print('\nPASS: ML data with input features, training and test generated...'
+          , '\n\nTotal no. of input features:'        , len(X.columns)
+          , '\n--------No. of numerical features:'    , len(numerical_cols)
+          , '\n--------No. of categorical features:'  , len(categorical_cols)
+          
+          , '\n\nTotal no. of evolutionary features:' , len(X_evolFN)
+          
+          , '\n\nTotal no. of stability features:'    , len(X_stability_FN)
+          , '\n--------Common stabilty cols:'         , len(X_common_stability_Fnum)
+          , '\n--------Foldx cols:'                   , len(X_foldX_Fnum)
+          
+          , '\n\nTotal no. of affinity features:'     , len(X_affinityFN)
+          , '\n--------Common affinity cols:'         , len(common_affinity_Fnum)
+          , '\n--------Gene specific affinity cols:'     , len(gene_affinity_colnames)
+          
+          , '\n\nTotal no. of residue level features:', len(X_resprop_FN)
+          , '\n--------AA index cols:'                , len(X_aaindex_Fnum)
+          , '\n--------Residue Prop cols:'            , len(X_str_Fnum)
+          , '\n--------AA change Prop cols:'          , len(X_aap_Fcat)
+          
+          , '\n\nTotal no. of genomic features:'      , len(X_genomicFN)
+          , '\n--------MAF+OR cols:'                  , len(X_gn_mafor_Fnum)
+          , '\n--------Lineage cols:'                 , len(X_gn_linegae_Fnum)
+          , '\n--------Other cols:'                   , len(X_gn_Fcat)
+          )
+else:
+    print('\nFAIL: numbers mismatch'
+          , '\nExpected:',len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN)
+          , '\nGot:', len(X.columns))
+    sys.exit()
+###############################################################################
 print('\n-------------------------------------------------------------'
-      , '\nSuccessfully split data: UQ [no aa_index but active site included] training'
+      , '\nSuccessfully split data: ALL features'
      , '\nactual values: training set'
      , '\nimputed values: blind test set'
-      , '\nTrain data size:', X.shape
-      , '\nTest data size:', X_bts.shape
+      
+      , '\n\nTotal data size:', len(X) + len(X_bts)
+
+      , '\n\nTrain data size:', X.shape
      , '\ny_train numbers:', yc1
-      , '\ny_train ratio:',yc1_ratio
-      , '\n'
+
+      , '\n\nTest data size:', X_bts.shape
      , '\ny_test_numbers:', yc2
+
+      , '\n\ny_train ratio:',yc1_ratio
      , '\ny_test ratio:', yc2_ratio
      , '\n-------------------------------------------------------------'
      )
+
 ###########################################################################
 #%% 
 ###########################################################################