ML_AI_training/temp.py

92 lines
3.4 KiB
Python
Executable file

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun May 29 09:22:51 2022
@author: tanu
"""
# Gene groups. Further down, the group a gene belongs to decides which
# affinity columns are added to the common stability columns:
#   geneL_basic   -> common columns only
#   geneL_na      -> + 'mcsm_na_affinity'
#   geneL_na_ppi2 -> + 'mcsm_na_affinity', 'mcsm_ppi2_affinity', 'interface_dist'
#   geneL_ppi2    -> + 'mcsm_ppi2_affinity', 'interface_dist'
# NOTE(review): several names are mixed-case ('pncA', 'rpoB', 'embB', 'katG');
# any membership test against gene.lower() must lowercase these entries too.
geneL_basic = ["pncA"]
geneL_na = ["gid"]
geneL_na_ppi2 = ["rpoB"]
geneL_ppi2 = [
    "alr",
    "embB",
    "katG",
]
#%% get cols
mycols = my_df.columns

# Disabled: cast 'active_aa_pos' from a numeric dtype to object.
# num_type = ['int64', 'float64']
# cat_type = ['object', 'bool']
# if my_df['active_aa_pos'].dtype in num_type:
#     my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
#     my_df['active_aa_pos'].dtype

# FIXME: if this is not structural, remove from source..
# Drop rows where numerical cols have NA.
# The gene list holds mixed-case names (e.g. 'rpoB'), so both sides of the
# membership test are lowercased. Comparing gene.lower() against the raw list
# could never match, which left the NA rows in place.
if gene.lower() in [g.lower() for g in geneL_na_ppi2]:
    # D1148 get rid of
    na_index = my_df.index[my_df['mcsm_na_affinity'].isna()]
    my_df = my_df.drop(index=na_index)

# FIXME: either impute or remove!
# for embb (L114M, F115L, V123L, V125I, V131M) delete for now
if gene.lower() == 'embb':
    # The rows are only located here; the drop itself is disabled below.
    na_index = my_df.index[my_df['ligand_distance'].isna()]
    # my_df = my_df.drop(index=na_index)  # RERUN embb with the 5 values now present
#%%===========================================================================
#%%
# GET X
# Stability/affinity columns used for every gene.
common_cols_stabiltyN = [
    'ligand_distance',
    'ligand_affinity_change',
    'duet_stability_change',
    'ddg_foldx',
    'deepddg',
    'ddg_dynamut2',
    'contacts',
]

# Build stability columns ~ gene
# Both sides of each membership test are lowercased: the gene lists hold
# mixed-case names ('pncA', 'rpoB', 'embB', 'katG') that gene.lower() can
# never equal as written, which left x_stabilityN / cols_to_mask undefined
# for those genes.
gene_lc = gene.lower()

if gene_lc in [g.lower() for g in geneL_basic]:
    # Common columns only (same list object, not a copy).
    x_stabilityN = common_cols_stabiltyN
    cols_to_mask = ['ligand_affinity_change']

elif gene_lc in [g.lower() for g in geneL_ppi2]:
    # Common columns + ppi2 affinity and interface distance.
    geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist']
    x_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']

elif gene_lc in [g.lower() for g in geneL_na]:
    # Common columns + NA affinity.
    geneL_na_st_cols = ['mcsm_na_affinity']
    x_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']

elif gene_lc in [g.lower() for g in geneL_na_ppi2]:
    # Common columns + NA affinity + ppi2 affinity and interface distance.
    geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
    x_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
#%% Masking columns (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
# Rows whose ligand distance exceeds 10. Computed once and reused below:
# the masking step only writes to cols_to_mask, never to 'ligand_distance',
# so this selection and its count stay valid after masking.
lig_dist_gt10 = my_df_ml['ligand_distance'] > 10
n_lig_dist_gt10 = lig_dist_gt10.sum()

# Exploratory counts before masking (results are not stored).
my_df_ml['mutationinformation'][lig_dist_gt10].value_counts()
my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
my_df_ml.loc[lig_dist_gt10, cols_to_mask].value_counts()

# mask the column ligand distance > 10
my_df_ml.loc[lig_dist_gt10, cols_to_mask] = 0

(my_df_ml['ligand_affinity_change'] == 0).sum()

mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]

# For each masked column, print whether its count of zeros equals the number
# of rows with ligand distance > 10. The loop covers every entry of
# cols_to_mask, so no hard-coded cols_to_mask[0] / cols_to_mask[1] checks
# follow it: indexing [1] raised IndexError when only one column is masked.
for i, col in enumerate(cols_to_mask):
    ind = i + 1
    print('\nindex:', i, '\nind:', ind)
    print('\nMask count check:'
          , (my_df_ml[col] == 0).sum() == n_lig_dist_gt10
          )