From f9129b9ebcaccbdfed856c877853c2767c38d939 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 1 Sep 2022 12:55:38 +0100 Subject: [PATCH] added nca dist criteria for masking --- scripts/ml/ml_functions/GetMLData.py | 57 +++++++++++++++++++------ scripts/ml/ml_functions/GetMLData_v1.py | 57 ++++++------------------- 2 files changed, 57 insertions(+), 57 deletions(-) diff --git a/scripts/ml/ml_functions/GetMLData.py b/scripts/ml/ml_functions/GetMLData.py index bdbd70e..d5eab71 100755 --- a/scripts/ml/ml_functions/GetMLData.py +++ b/scripts/ml/ml_functions/GetMLData.py @@ -436,30 +436,35 @@ def getmldata(gene, drug if gene.lower() in geneL_basic: #X_stabilityN = common_cols_stabiltyN gene_affinity_colnames = []# not needed as its the common ones - cols_to_mask = ['ligand_affinity_change'] + cols_to_mask = ['ligand_affinity_change'] cols_to_mask_ppi2 = [] + cols_to_mask_na = [] if gene.lower() in geneL_ppi2: gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols #cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity'] - cols_to_mask = ['ligand_affinity_change'] + cols_to_mask = ['ligand_affinity_change'] cols_to_mask_ppi2 = ['mcsm_ppi2_affinity'] + cols_to_mask_na = [] if gene.lower() in geneL_na: - gene_affinity_colnames = ['mcsm_na_affinity'] + gene_affinity_colnames = ['mcsm_na_affinity', 'nca_distance'] #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols - cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity'] - cols_to_mask_ppi2 = [] + cols_to_mask = ['ligand_affinity_change']#, 'mcsm_na_affinity'] + cols_to_mask_ppi2 = [] + cols_to_mask_na = ['mcsm_na_affinity'] - + # both if gene.lower() in geneL_na_ppi2: - gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] + gene_affinity_colnames = ['mcsm_na_affinity','nca_distance', 'mcsm_ppi2_affinity', 'interface_dist'] #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols #cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity'] - cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity'] + cols_to_mask = ['ligand_affinity_change']#, 'mcsm_na_affinity'] cols_to_mask_ppi2 = ['mcsm_ppi2_affinity'] + cols_to_mask_na = ['mcsm_na_affinity'] + #======================= # Masking columns: @@ -470,19 +475,47 @@ def getmldata(gene, drug my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts() + #--------------------------- + # mask ligand affinity column where ligand distance > 10 + #--------------------------- # mask the mcsm ligand affinity AND mcsm_na affinity columns where ligand distance > 10 my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0 + #mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask] + mask_check_cols = ['mutationinformation', 'ligand_distance'] + cols_to_mask + + #--------------------------- # mask the mcsm_ppi2_affinity column where interface_dist > 10 + #--------------------------- if len(cols_to_mask_ppi2) > 0: my_df_ml.loc[(my_df_ml['interface_dist'] > 10), cols_to_mask_ppi2] = 0 add_cols_mask = ['interface_dist'] + cols_to_mask_ppi2 - mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask] - else: - mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask ] + #mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask] + mask_check_cols = mask_check_cols + add_cols_mask + + #--------------------------- + # mask the na_affinity column where nca_distance > 10 + #--------------------------- + if len(cols_to_mask_na) > 0: + my_df_ml.loc[(my_df_ml['nca_distance'] > 10), cols_to_mask_na] = 0 + add_cols_mask = ['nca_distance'] + cols_to_mask_na + #mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask] + mask_check_cols = mask_check_cols + add_cols_mask + + # if gene.lower() in geneL_na_ppi2: + # #--------------------------- + # # RPOB: mask ppi2 + na + lig cols + # #--------------------------- + # mask_check = my_df_ml[['mutationinformation', + # 'ligand_distance', 'ligand_affinity_change', + # 'nca_distance','mcsm_na_affinity', + # 'mcsm_ppi2_affinity','interface_dist']] + + # GET mask data + mask_check = my_df_ml[mask_check_cols] + # sanity check: check script SANITY_CHECK_mask.py - if write_maskfile: # write mask file for sanity check #mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True) diff --git a/scripts/ml/ml_functions/GetMLData_v1.py b/scripts/ml/ml_functions/GetMLData_v1.py index d5eab71..bdbd70e 100755 --- a/scripts/ml/ml_functions/GetMLData_v1.py +++ b/scripts/ml/ml_functions/GetMLData_v1.py @@ -436,35 +436,30 @@ def getmldata(gene, drug if gene.lower() in geneL_basic: #X_stabilityN = common_cols_stabiltyN gene_affinity_colnames = []# not needed as its the common ones - cols_to_mask = ['ligand_affinity_change'] + cols_to_mask = ['ligand_affinity_change'] cols_to_mask_ppi2 = [] - cols_to_mask_na = [] if gene.lower() in geneL_ppi2: gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols #cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity'] - cols_to_mask = ['ligand_affinity_change'] + cols_to_mask = ['ligand_affinity_change'] cols_to_mask_ppi2 = ['mcsm_ppi2_affinity'] - cols_to_mask_na = [] if gene.lower() in geneL_na: - gene_affinity_colnames = ['mcsm_na_affinity', 'nca_distance'] + gene_affinity_colnames = ['mcsm_na_affinity'] #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols - cols_to_mask = ['ligand_affinity_change']#, 'mcsm_na_affinity'] - cols_to_mask_ppi2 = [] - cols_to_mask_na = ['mcsm_na_affinity'] + cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity'] + cols_to_mask_ppi2 = [] - # both + if gene.lower() in geneL_na_ppi2: - gene_affinity_colnames = ['mcsm_na_affinity','nca_distance', 'mcsm_ppi2_affinity', 'interface_dist'] + gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols #cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity'] - cols_to_mask = ['ligand_affinity_change']#, 'mcsm_na_affinity'] + cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity'] cols_to_mask_ppi2 = ['mcsm_ppi2_affinity'] - cols_to_mask_na = ['mcsm_na_affinity'] - #======================= # Masking columns: @@ -475,47 +470,19 @@ def getmldata(gene, drug my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts() - #--------------------------- - # mask ligand affinity column where ligand distance > 10 - #--------------------------- # mask the mcsm ligand affinity AND mcsm_na affinity columns where ligand distance > 10 my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0 - #mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask] - mask_check_cols = ['mutationinformation', 'ligand_distance'] + cols_to_mask - - #--------------------------- # mask the mcsm_ppi2_affinity column where interface_dist > 10 - #--------------------------- if len(cols_to_mask_ppi2) > 0: my_df_ml.loc[(my_df_ml['interface_dist'] > 10), cols_to_mask_ppi2] = 0 add_cols_mask = ['interface_dist'] + cols_to_mask_ppi2 - #mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask] - mask_check_cols = mask_check_cols + add_cols_mask - - #--------------------------- - # mask the na_affinity column where nca_distance > 10 - #--------------------------- - if len(cols_to_mask_na) > 0: - my_df_ml.loc[(my_df_ml['nca_distance'] > 10), cols_to_mask_na] = 0 - add_cols_mask = ['nca_distance'] + cols_to_mask_na - #mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask] - mask_check_cols = mask_check_cols + add_cols_mask - - # if gene.lower() in geneL_na_ppi2: - # #--------------------------- - # # RPOB: mask ppi2 + na + lig cols - # #--------------------------- - # mask_check = my_df_ml[['mutationinformation', - # 'ligand_distance', 'ligand_affinity_change', - # 'nca_distance','mcsm_na_affinity', - # 'mcsm_ppi2_affinity','interface_dist']] - + mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask] + else: + mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask ] - # GET mask data - mask_check = my_df_ml[mask_check_cols] - # sanity check: check script SANITY_CHECK_mask.py + if write_maskfile: # write mask file for sanity check #mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)