#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun May 29 09:22:51 2022

@author: tanu

Select the stability feature columns for a gene and mask affinity columns
for rows that lie far from the ligand.

This section reads three names that it does not define itself and that must
exist before it runs: ``gene`` (a string), ``my_df`` and ``my_df_ml``
(pandas DataFrames).
"""
# Gene groups. Entries are mixed-case, so membership is always tested
# case-insensitively through _gene_in() below.
geneL_basic = ['pncA']
geneL_na = ['gid']
geneL_na_ppi2 = ['rpoB']
geneL_ppi2 = ['alr', 'embB', 'katG']


def _gene_in(gene_name, gene_list):
    """Return True if gene_name equals an entry of gene_list, ignoring case."""
    target = gene_name.lower()
    return any(target == entry.lower() for entry in gene_list)


#%% get cols
mycols = my_df.columns

# # change from numeric to
# num_type = ['int64', 'float64']
# cat_type = ['object', 'bool']

# if my_df['active_aa_pos'].dtype in num_type:
#     my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
#     my_df['active_aa_pos'].dtype

# FIXME: if this is not structural, remove from source..
# Drop NA where numerical cols have them
if _gene_in(gene, geneL_na_ppi2):
    # D1148 get rid of
    # Rows with a missing mcsm_na_affinity value are removed from my_df.
    na_index = my_df.index[my_df['mcsm_na_affinity'].isna()]
    my_df = my_df.drop(index=na_index)

# FIXME: either impute or remove!
# for embb (L114M, F115L, V123L, V125I, V131M) delete for now
if gene.lower() == 'embb':
    # The rows are only located here; the drop itself is intentionally left
    # disabled (RERUN embb with the 5 values now present).
    na_index = my_df.index[my_df['ligand_distance'].isna()]
    # my_df = my_df.drop(index=na_index)

#%%===========================================================================
#%%
# GET X
common_cols_stabiltyN = ['ligand_distance'
                         , 'ligand_affinity_change'
                         , 'duet_stability_change'
                         , 'ddg_foldx'
                         , 'deepddg'
                         , 'ddg_dynamut2'
                         , 'contacts']

# Build stability columns ~ gene
# Exactly one branch applies: the gene groups above do not overlap.
if _gene_in(gene, geneL_basic):
    x_stabilityN = common_cols_stabiltyN
    cols_to_mask = ['ligand_affinity_change']

elif _gene_in(gene, geneL_ppi2):
    geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist']
    x_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']

elif _gene_in(gene, geneL_na):
    geneL_na_st_cols = ['mcsm_na_affinity']
    x_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']

elif _gene_in(gene, geneL_na_ppi2):
    geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
    x_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']

else:
    # Fail loudly instead of continuing with undefined (or stale, in an
    # interactive session) x_stabilityN / cols_to_mask.
    raise ValueError('Unrecognised gene: ' + repr(gene))

#%% Masking columns (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
# Rows whose ligand_distance is greater than 10; computed once and reused.
far_from_ligand = my_df_ml['ligand_distance'] > 10
n_far = far_from_ligand.sum()

# mask the column ligand distance > 10
my_df_ml.loc[far_from_ligand, cols_to_mask] = 0

# Side-by-side view of the distance and the masked columns for inspection.
mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]

# Per-column sanity check: number of zeros vs. number of masked rows.
# NOTE: this prints False for a column that also holds zeros in rows that
# were not masked, so False does not by itself mean the masking failed.
for i, col in enumerate(cols_to_mask):
    ind = i + 1
    print('\nindex:', i, '\nind:', ind)
    print('\nMask count check:', (my_df_ml[col] == 0).sum() == n_far)