ML_AI_training/temp.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun May 29 09:22:51 2022

@author: tanu
"""

# Gene groups used to pick the gene-specific feature columns further down.
# Entries keep their mixed-case spelling; any code that matches them against
# `gene.lower()` has to lower-case these entries as well.
geneL_basic = ["pncA"]                   # common stability columns only
geneL_na = ["gid"]                       # adds the mCSM-NA affinity column
geneL_na_ppi2 = ["rpoB"]                 # adds mCSM-NA and mCSM-PPI2 columns
geneL_ppi2 = ["alr", "embB", "katG"]     # adds the mCSM-PPI2 columns
#%% get cols
# Keep a handle on the column labels of the input frame.
mycols = my_df.columns

# Disabled: change 'active_aa_pos' from numeric to categorical.
# num_type = ['int64', 'float64']
# cat_type = ['object', 'bool']

# if my_df['active_aa_pos'].dtype in num_type:
#     my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
#     my_df['active_aa_pos'].dtype

# Compare genes case-insensitively on BOTH sides: the gene lists hold
# mixed-case names (e.g. 'rpoB'), which can never equal `gene.lower()`.
gene_lc = gene.lower()

# FIXME: if this is not structural, remove from source..
# Drop the rows whose mcsm_na_affinity value is missing.
if gene_lc in {g.lower() for g in geneL_na_ppi2}:
    # D1148: get rid of
    # `isna()` is vectorised and also copes with None / object-dtype values,
    # unlike a per-element `apply(np.isnan)`.
    na_index = my_df.index[my_df['mcsm_na_affinity'].isna()]
    my_df = my_df.drop(index=na_index)

# FIXME: either impute or remove!
# for embb (L114M, F115L, V123L, V125I, V131M) delete for now
if gene_lc == 'embb':
    # Only the index of the affected rows is collected; the drop itself is
    # intentionally disabled.
    na_index = my_df.index[my_df['ligand_distance'].isna()]
    # my_df = my_df.drop(index=na_index)  # RERUN embb with the 5 values now present
#%%===========================================================================

#%%
# GET X
# Stability/affinity feature columns shared by every gene.
common_cols_stabiltyN = [
    'ligand_distance',
    'ligand_affinity_change',
    'duet_stability_change',
    'ddg_foldx',
    'deepddg',
    'ddg_dynamut2',
    'contacts',
]

# Build stability columns ~ gene.
# Both sides of the membership test are lower-cased: the gene lists hold
# mixed-case names (e.g. 'pncA', 'embB'), which never equal `gene.lower()`.
gene_lc = gene.lower()

if gene_lc in {g.lower() for g in geneL_basic}:
    # Copy so that later in-place edits of x_stabilityN cannot alter the
    # shared common list.
    x_stabilityN = list(common_cols_stabiltyN)
    cols_to_mask = ['ligand_affinity_change']

elif gene_lc in {g.lower() for g in geneL_ppi2}:
    geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist']
    x_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']

elif gene_lc in {g.lower() for g in geneL_na}:
    geneL_na_st_cols = ['mcsm_na_affinity']
    x_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']

elif gene_lc in {g.lower() for g in geneL_na_ppi2}:
    geneL_na_ppi2_st_cols = ['mcsm_na_affinity', 'mcsm_ppi2_affinity', 'interface_dist']
    x_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']

else:
    # Fail here with a clear message instead of leaving x_stabilityN and
    # cols_to_mask undefined (or stale from an earlier interactive run).
    raise ValueError('No stability columns defined for gene: {!r}'.format(gene))


#%% Masking columns (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
# Rows whose ligand distance exceeds 10. Computed once and reused: the masking
# below only writes to `cols_to_mask`, never to 'ligand_distance'.
far_from_ligand = my_df_ml['ligand_distance'] > 10

# Exploratory summaries of the affected rows; the results are only visible
# when these lines are evaluated interactively.
my_df_ml.loc[far_from_ligand, 'mutationinformation'].value_counts()
my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
my_df_ml.loc[far_from_ligand, cols_to_mask].value_counts()

# mask the columns where ligand distance > 10
my_df_ml.loc[far_from_ligand, cols_to_mask] = 0
(my_df_ml['ligand_affinity_change'] == 0).sum()

# Side-by-side view of the distance and the masked columns for inspection.
mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]

# For every masked column, report whether the number of zeros equals the
# number of rows beyond the distance cutoff.
# NOTE(review): this prints False if a column already held zeros for rows
# within the cutoff -- confirm that is the intended check.
n_far = far_from_ligand.sum()
for i, col in enumerate(cols_to_mask):
    ind = i+1
    print('\nindex:', i, '\nind:', ind)
    print('\nMask count check:'
          , (my_df_ml[col]==0).sum() == n_far
          )

# The former hard-coded checks on cols_to_mask[0] and cols_to_mask[1] were
# removed: the loop above already covers every column, and indexing [1]
# raised IndexError when only one column is masked.