ML_AI_training/UQ_TODO_categorical_classification_columns.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 25 02:01:19 2022

@author: tanu
"""
# TODO
# categorical_cols = ['ss_class'
#                     , 'wt_prop_water'
#                     , 'mut_prop_water'
#                     , 'wt_prop_polarity'
#                     , 'mut_prop_polarity'
#                     , 'wt_calcprop'
#                     , 'mut_calcprop']

# Label each mutation by its wild-type -> mutant water-affinity transition,
# then collapse that label to 'change' / 'no_change'.
my_df['water_change'] = my_df['wt_prop_water'] + '_to_' + my_df['mut_prop_water']
my_df['water_change'].value_counts()

# Every ordered pair of water classes: identical classes mean 'no_change',
# anything else is a 'change'.
_water_classes = ('hydrophobic', 'neutral', 'hydrophilic')
water_prop_changeD = {
    f'{wt_class}_to_{mut_class}': 'no_change' if wt_class == mut_class else 'change'
    for wt_class in _water_classes
    for mut_class in _water_classes
}

# Transition labels not present in the lookup become NaN after map().
my_df['water_change'] = my_df['water_change'].map(water_prop_changeD)
my_df['water_change'].value_counts()

#%%
# Label each mutation by its wild-type -> mutant polarity transition,
# then collapse that label to 'change' / 'no_change'.
my_df['polarity_change'] = my_df['wt_prop_polarity'] + '_to_' + my_df['mut_prop_polarity']
my_df['polarity_change'].value_counts()
# add a no change category

# Every ordered pair of polarity classes: identical classes mean 'no_change',
# anything else is a 'change'.
_polarity_classes = ('non-polar', 'neutral', 'acidic', 'basic')
polarity_prop_changeD = {
    f'{wt_class}_to_{mut_class}': 'no_change' if wt_class == mut_class else 'change'
    for wt_class in _polarity_classes
    for mut_class in _polarity_classes
}

# Transition labels not present in the lookup become NaN after map().
my_df['polarity_change'] = my_df['polarity_change'].map(polarity_prop_changeD)
my_df['polarity_change'].value_counts()

#%%
# Label each mutation by its wild-type -> mutant 'calcprop' transition,
# then collapse that label to 'change' / 'no_change'.
my_df['electrostatics_change'] = my_df['wt_calcprop'] + '_to_' + my_df['mut_calcprop']
my_df['electrostatics_change'].value_counts()

# Every ordered pair of calcprop classes: identical classes mean 'no_change',
# anything else is a 'change'.
_calcprop_classes = ('non-polar', 'polar', 'pos', 'neg')
calc_prop_changeD = {
    f'{wt_class}_to_{mut_class}': 'no_change' if wt_class == mut_class else 'change'
    for wt_class in _calcprop_classes
    for mut_class in _calcprop_classes
}

# Transition labels not present in the lookup become NaN after map().
my_df['electrostatics_change'] = my_df['electrostatics_change'].map(calc_prop_changeD)
my_df['electrostatics_change'].value_counts()

#%%
#https://stackoverflow.com/questions/47181187/finding-string-over-multiple-columns-in-pandas
# Flag rows in which at least one property column is labelled exactly 'change'.
detect_change = 'change'

# if detect_change in my_df['water_change'] | my_df['polarity_change'] | my_df['electrostatics_change']:
#     print('\nChange detected')

check = ['mutationinformation', 'wild_type', 'water_change', 'polarity_change', 'electrostatics_change']
check_prop_cols = ['water_change', 'polarity_change', 'electrostatics_change']
# Take an explicit copy: columns are added to foo below, and assigning into a
# bare column selection of my_df triggers pandas' SettingWithCopyWarning.
foo = my_df[check].copy()

# Whole-cell equality across all selected columns; 1 if any cell in the row
# equals 'change', otherwise 0.
foo['aa_prop_change'] = (foo.values == detect_change).any(axis=1).astype(int)
# Substring matching is too loose here: 'change' is contained in 'no_change',
# so the variant below is always 1.
#foo['aa_prop_change3'] = foo[check_prop_cols].applymap(lambda x: detect_change in x).any(1).astype(int) # loose match so always 1
# Same test restricted to the three property columns only.
foo['aa_prop_change2'] = (foo[check_prop_cols].values == detect_change).any(axis=1).astype(int)

# Sanity check (result is only visible when run interactively): both flags
# should agree on every row.
all(foo['aa_prop_change'] == foo['aa_prop_change2'])
#%%lineage
# snp_frequency and lineage_count_all differ because the same mutation can
# occur in more than one lineage.
lineage_colnames = ['snp_frequency', 'lineage', 'lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode']
# Take an explicit copy: columns are added to bar below, and assigning into a
# bare column selection of my_df triggers pandas' SettingWithCopyWarning.
bar = my_df[lineage_colnames].copy()

# NOTE(review): presumably the total number of distinct lineages in the
# dataset -- hard-coded, confirm against the data.
tot_lineage_u = 8
bar['lineage'].value_counts()
# Unique lineage count relative to the count over all lineage occurrences.
bar['lineage_proportion'] = bar['lineage_count_unique']/bar['lineage_count_all']
# Unique lineage count relative to the fixed total above.
bar['dist_lineage_proportion'] = bar['lineage_count_unique']/tot_lineage_u