Fixed the masking condition for the gene ML training data and wrote out the revised mask files

This commit is contained in:
Tanushree Tunstall 2022-07-27 13:36:16 +01:00
parent 0adf69f75a
commit f4cab1fdfb
3 changed files with 46 additions and 26 deletions

View file

@ -101,14 +101,15 @@ def getmldata(gene, drug
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
outdir_ml = outdir + 'ml/'
#outdir_ml = outdir + 'ml/'
outdir_ml = homedir + '/git/LSHTM_ML/output/'
#==========================
# outfile for ML training:
#==========================
outFile_ml = outdir_ml + gene.lower() + '_training_data.csv'
outFile_ml = outdir_ml + gene.lower() + '_training_data.csv'
outFile_mask_ml = outdir_ml + gene.lower() + '_mask_check.csv'
outFile_mask_ml = outdir_ml + 'genes/mask_check/' + gene.lower() + '_mask_check.csv'
#=======
# input
@ -436,41 +437,58 @@ def getmldata(gene, drug
#X_stabilityN = common_cols_stabiltyN
gene_affinity_colnames = []# not needed as its the common ones
cols_to_mask = ['ligand_affinity_change']
cols_to_mask_ppi2 = []
if gene.lower() in geneL_ppi2:
gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist']
#X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
#cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
cols_to_mask = ['ligand_affinity_change']
cols_to_mask_ppi2 = ['mcsm_ppi2_affinity']
if gene.lower() in geneL_na:
gene_affinity_colnames = ['mcsm_na_affinity']
#X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
cols_to_mask_ppi2 = []
if gene.lower() in geneL_na_ppi2:
gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
#X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
#cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
cols_to_mask_ppi2 = ['mcsm_ppi2_affinity']
#=======================
# Masking columns:
# (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
# Masking columns:
# ligand_distance > 10 ==> mCSM-lig AND mCSM-NA column values are set to 0
# interface_dist > 10 ==> mCSM-ppi2 column values are set to 0
#=======================
my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
# mask the mcsm affinity related columns where ligand distance > 10
# mask the mcsm ligand affinity AND mcsm_na affinity columns where ligand distance > 10
my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
(my_df_ml['ligand_affinity_change'] == 0).sum()
# mask the mcsm_ppi2_affinity column where interface_dist > 10
if len(cols_to_mask_ppi2) > 0:
my_df_ml.loc[(my_df_ml['interface_dist'] > 10), cols_to_mask_ppi2] = 0
add_cols_mask = ['interface_dist'] + cols_to_mask_ppi2
mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask]
else:
mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask ]
mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]
# sanity check: check script SANITY_CHECK_mask.py
#===================================================
# write file for check
#mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
#mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
#===================================================
if write_maskfile:
# write mask file for sanity check
#mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
mask_check.to_csv(outdir_ml + gene.lower() + '_mask_check.csv')
###############################################################################
#%% Feature groups (FG): Build X for Input ML
############################################################################