92 lines
3.4 KiB
Python
Executable file
92 lines
3.4 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Created on Sun May 29 09:22:51 2022
|
|
|
|
@author: tanu
|
|
"""
|
|
|
|
# Gene groups. Each list names the genes that share one set of extra
# affinity/interface columns when the stability feature columns are built.
geneL_basic = ["pncA"]  # common stability columns only
geneL_na = ["gid"]  # adds the mCSM-NA affinity column
geneL_na_ppi2 = ["rpoB"]  # adds the mCSM-NA and the mCSM-PPI2 columns
geneL_ppi2 = [
    "alr",
    "embB",
    "katG",
]  # adds the mCSM-PPI2 affinity and interface-distance columns
|
|
#%% get cols
# Keep a handle on the column labels of my_df
# (presumably a pandas DataFrame -- it is indexed and row-dropped as one below).
mycols = my_df.columns
|
|
|
|
# # change 'active_aa_pos' from a numeric dtype to object
|
|
# num_type = ['int64', 'float64']
|
|
# cat_type = ['object', 'bool']
|
|
|
|
# if my_df['active_aa_pos'].dtype in num_type:
|
|
# my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
|
|
# my_df['active_aa_pos'].dtype
|
|
|
|
# FIXME: if this is not structural, remove from source..
# Drop NA where numerical cols have them
# For genes in geneL_na_ppi2, rows with a missing 'mcsm_na_affinity' are
# removed from my_df (original note: "D1148 get rid of").
#
# The gene name is matched case-insensitively: geneL_na_ppi2 stores the
# mixed-case name 'rpoB', which a bare `gene.lower() in geneL_na_ppi2`
# test can never equal, so the drop was silently skipped.
if gene.lower() in [g.lower() for g in geneL_na_ppi2]:
    # .isna() flags missing values without needing an element-wise np.isnan
    na_index = my_df.index[my_df['mcsm_na_affinity'].isna()]
    my_df = my_df.drop(index=na_index)
|
|
|
|
# FIXME: either impute or remove!
# embb: the original note lists L114M, F115L, V123L, V125I, V131M as the
# mutations to delete "for now". Only the index labels of rows with a missing
# 'ligand_distance' are collected; the drop itself is currently disabled.
if gene.lower() == 'embb':
    na_index = my_df['mutationinformation'].index[
        my_df['ligand_distance'].apply(np.isnan)
    ]
    # my_df = my_df.drop(index=na_index)  # RERUN embb with the 5 values now present
|
|
#%%===========================================================================
|
|
|
|
#%%
|
|
# GET X
# Base feature columns; the per-gene column lists are built on top of these.
common_cols_stabiltyN = [
    "ligand_distance",
    "ligand_affinity_change",
    "duet_stability_change",
    "ddg_foldx",
    "deepddg",
    "ddg_dynamut2",
    "contacts",
]
|
|
|
|
# Build stability columns ~ gene
# Sets, for the current gene:
#   x_stabilityN  - the feature columns (base columns + gene-group extras)
#   cols_to_mask  - the affinity columns that are zeroed later for rows far
#                   from the ligand
#
# Gene names are matched case-insensitively. The group lists hold mixed-case
# names ('pncA', 'embB', 'katG', 'rpoB'), which a bare
# `gene.lower() in <list>` test can never equal, so none of the branches ran
# for those genes.
# The four groups are disjoint, hence a single if/elif chain.
_gene_lc = gene.lower()

if _gene_lc in [g.lower() for g in geneL_basic]:
    # copy, so later in-place edits of x_stabilityN cannot alter the base list
    x_stabilityN = list(common_cols_stabiltyN)
    cols_to_mask = ['ligand_affinity_change']

elif _gene_lc in [g.lower() for g in geneL_ppi2]:
    geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist']
    x_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']

elif _gene_lc in [g.lower() for g in geneL_na]:
    geneL_na_st_cols = ['mcsm_na_affinity']
    x_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']

elif _gene_lc in [g.lower() for g in geneL_na_ppi2]:
    geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
    x_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
|
|
|
|
|
|
#%% Masking columns (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
# Rows of my_df_ml whose 'ligand_distance' exceeds 10 get every column in
# cols_to_mask set to 0; each masked column is then checked in a loop.

# Boolean row selector, computed once. 'ligand_distance' is not one of the
# masked columns, so it stays valid after the assignment below.
far_from_ligand = my_df_ml['ligand_distance'] > 10
n_far_from_ligand = far_from_ligand.sum()

# Pre-mask inspection: these expressions only display something in an
# interactive session; their results are discarded when run as a script.
my_df_ml.loc[far_from_ligand, 'mutationinformation'].value_counts()
my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x > 10)).value_counts()
my_df_ml.loc[far_from_ligand, cols_to_mask].value_counts()

# mask the column ligand distance > 10
my_df_ml.loc[far_from_ligand, cols_to_mask] = 0
(my_df_ml['ligand_affinity_change'] == 0).sum()

# Side-by-side view of the identifier, the distance and the masked columns
mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]

# Check every masked column: its count of zeros should equal the number of
# masked rows.
# NOTE(review): this prints False if a row within distance 10 already holds
# a 0 in that column -- confirm that cannot happen in the data.
for i, col in enumerate(cols_to_mask):
    ind = i + 1
    print('\nindex:', i, '\nind:', ind)
    print('\nMask count check:', (my_df_ml[col] == 0).sum() == n_far_from_ligand)

# The former hard-coded checks on cols_to_mask[0] and cols_to_mask[1] are
# gone: the loop covers every column, and index 1 does not exist when only
# one column is masked (IndexError).
|