Added nca_distance criterion for masking mcsm_na_affinity

This commit is contained in:
Tanushree Tunstall 2022-09-01 12:55:38 +01:00
parent f94eadf1d4
commit f9129b9ebc
2 changed files with 57 additions and 57 deletions

View file

@ -436,30 +436,35 @@ def getmldata(gene, drug
if gene.lower() in geneL_basic:
#X_stabilityN = common_cols_stabiltyN
gene_affinity_colnames = []# not needed as its the common ones
cols_to_mask = ['ligand_affinity_change']
cols_to_mask = ['ligand_affinity_change']
cols_to_mask_ppi2 = []
cols_to_mask_na = []
if gene.lower() in geneL_ppi2:
gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist']
#X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
#cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
cols_to_mask = ['ligand_affinity_change']
cols_to_mask = ['ligand_affinity_change']
cols_to_mask_ppi2 = ['mcsm_ppi2_affinity']
cols_to_mask_na = []
if gene.lower() in geneL_na:
gene_affinity_colnames = ['mcsm_na_affinity']
gene_affinity_colnames = ['mcsm_na_affinity', 'nca_distance']
#X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
cols_to_mask_ppi2 = []
cols_to_mask = ['ligand_affinity_change']#, 'mcsm_na_affinity']
cols_to_mask_ppi2 = []
cols_to_mask_na = ['mcsm_na_affinity']
# both: genes with NA affinity and PPI2 affinity columns
if gene.lower() in geneL_na_ppi2:
gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
gene_affinity_colnames = ['mcsm_na_affinity','nca_distance', 'mcsm_ppi2_affinity', 'interface_dist']
#X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
#cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
cols_to_mask = ['ligand_affinity_change']#, 'mcsm_na_affinity']
cols_to_mask_ppi2 = ['mcsm_ppi2_affinity']
cols_to_mask_na = ['mcsm_na_affinity']
#=======================
# Masking columns:
@ -470,19 +475,47 @@ def getmldata(gene, drug
my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
#---------------------------
# mask ligand affinity column where ligand distance > 10
#---------------------------
# mask the mcsm ligand affinity AND mcsm_na affinity columns where ligand distance > 10
my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
#mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]
mask_check_cols = ['mutationinformation', 'ligand_distance'] + cols_to_mask
#---------------------------
# mask the mcsm_ppi2_affinity column where interface_dist > 10
#---------------------------
if len(cols_to_mask_ppi2) > 0:
my_df_ml.loc[(my_df_ml['interface_dist'] > 10), cols_to_mask_ppi2] = 0
add_cols_mask = ['interface_dist'] + cols_to_mask_ppi2
mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask]
else:
mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask ]
#mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask]
mask_check_cols = mask_check_cols + add_cols_mask
#---------------------------
# mask the na_affinity column where nca_distance > 10
#---------------------------
if len(cols_to_mask_na) > 0:
my_df_ml.loc[(my_df_ml['nca_distance'] > 10), cols_to_mask_na] = 0
add_cols_mask = ['nca_distance'] + cols_to_mask_na
#mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask]
mask_check_cols = mask_check_cols + add_cols_mask
# if gene.lower() in geneL_na_ppi2:
# #---------------------------
# # RPOB: mask ppi2 + na + lig cols
# #---------------------------
# mask_check = my_df_ml[['mutationinformation',
# 'ligand_distance', 'ligand_affinity_change',
# 'nca_distance','mcsm_na_affinity',
# 'mcsm_ppi2_affinity','interface_dist']]
# GET mask data
mask_check = my_df_ml[mask_check_cols]
# sanity check: see script SANITY_CHECK_mask.py
if write_maskfile:
# write mask file for sanity check
#mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)

View file

@ -436,35 +436,30 @@ def getmldata(gene, drug
if gene.lower() in geneL_basic:
#X_stabilityN = common_cols_stabiltyN
gene_affinity_colnames = []# not needed as its the common ones
cols_to_mask = ['ligand_affinity_change']
cols_to_mask = ['ligand_affinity_change']
cols_to_mask_ppi2 = []
cols_to_mask_na = []
if gene.lower() in geneL_ppi2:
gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist']
#X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
#cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
cols_to_mask = ['ligand_affinity_change']
cols_to_mask = ['ligand_affinity_change']
cols_to_mask_ppi2 = ['mcsm_ppi2_affinity']
cols_to_mask_na = []
if gene.lower() in geneL_na:
gene_affinity_colnames = ['mcsm_na_affinity', 'nca_distance']
gene_affinity_colnames = ['mcsm_na_affinity']
#X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
cols_to_mask = ['ligand_affinity_change']#, 'mcsm_na_affinity']
cols_to_mask_ppi2 = []
cols_to_mask_na = ['mcsm_na_affinity']
cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
cols_to_mask_ppi2 = []
# both: genes with NA affinity and PPI2 affinity columns
if gene.lower() in geneL_na_ppi2:
gene_affinity_colnames = ['mcsm_na_affinity','nca_distance', 'mcsm_ppi2_affinity', 'interface_dist']
gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
#X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
#cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
cols_to_mask = ['ligand_affinity_change']#, 'mcsm_na_affinity']
cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
cols_to_mask_ppi2 = ['mcsm_ppi2_affinity']
cols_to_mask_na = ['mcsm_na_affinity']
#=======================
# Masking columns:
@ -475,47 +470,19 @@ def getmldata(gene, drug
my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
#---------------------------
# mask ligand affinity column where ligand distance > 10
#---------------------------
# mask the mcsm ligand affinity AND mcsm_na affinity columns where ligand distance > 10
my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
#mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]
mask_check_cols = ['mutationinformation', 'ligand_distance'] + cols_to_mask
#---------------------------
# mask the mcsm_ppi2_affinity column where interface_dist > 10
#---------------------------
if len(cols_to_mask_ppi2) > 0:
my_df_ml.loc[(my_df_ml['interface_dist'] > 10), cols_to_mask_ppi2] = 0
add_cols_mask = ['interface_dist'] + cols_to_mask_ppi2
#mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask]
mask_check_cols = mask_check_cols + add_cols_mask
#---------------------------
# mask the na_affinity column where nca_distance > 10
#---------------------------
if len(cols_to_mask_na) > 0:
my_df_ml.loc[(my_df_ml['nca_distance'] > 10), cols_to_mask_na] = 0
add_cols_mask = ['nca_distance'] + cols_to_mask_na
#mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask]
mask_check_cols = mask_check_cols + add_cols_mask
# if gene.lower() in geneL_na_ppi2:
# #---------------------------
# # RPOB: mask ppi2 + na + lig cols
# #---------------------------
# mask_check = my_df_ml[['mutationinformation',
# 'ligand_distance', 'ligand_affinity_change',
# 'nca_distance','mcsm_na_affinity',
# 'mcsm_ppi2_affinity','interface_dist']]
mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask]
else:
mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask ]
# GET mask data
mask_check = my_df_ml[mask_check_cols]
# sanity check: see script SANITY_CHECK_mask.py
if write_maskfile:
# write mask file for sanity check
#mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)