#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 25 02:01:19 2022

@author: tanu

Derive amino-acid property-change flags and lineage proportions.

This is a cell-based (``#%%``) analysis script.  It expects a pandas
DataFrame called ``my_df`` to exist already; it is not created here.

Columns read from ``my_df``:
    wt_prop_water, mut_prop_water, wt_prop_polarity, mut_prop_polarity,
    wt_calcprop, mut_calcprop, mutationinformation, wild_type,
    snp_frequency, lineage, lineage_list_all, lineage_count_all,
    lineage_count_unique, lineage_list_unique, lineage_multimode

Columns added to ``my_df``:
    water_change, polarity_change, electrostatics_change
    (each 'change' / 'no_change'; NaN when the wild-type/mutant pair is not
    listed in the corresponding lookup dict, as ``Series.map`` leaves
    unmatched keys as NaN)

Working frames created:
    foo -- property-change columns plus the 0/1 flags ``aa_prop_change``
           and ``aa_prop_change2``
    bar -- lineage columns plus ``lineage_proportion`` and
           ``dist_lineage_proportion``

Bare expressions such as ``....value_counts()`` are kept for interactive
inspection; they have no effect when the file is run non-interactively.
"""
# TODO
# categorical_cols = ['ss_class'
#                     , 'wt_prop_water'
#                     , 'mut_prop_water'
#                     , 'wt_prop_polarity'
#                     , 'mut_prop_polarity'
#                     , 'wt_calcprop'
#                     , 'mut_calcprop']

# Water property: build '<wild-type>_to_<mutant>' labels, then collapse them
# into 'change' / 'no_change'.
my_df['water_change'] = my_df['wt_prop_water'] + '_to_' + my_df['mut_prop_water']
my_df['water_change'].value_counts()

water_prop_changeD = {
    'hydrophobic_to_neutral': 'change',
    'hydrophobic_to_hydrophobic': 'no_change',
    'neutral_to_neutral': 'no_change',
    'neutral_to_hydrophobic': 'change',
    'hydrophobic_to_hydrophilic': 'change',
    'neutral_to_hydrophilic': 'change',
    'hydrophilic_to_neutral': 'change',
    'hydrophilic_to_hydrophobic': 'change',
    'hydrophilic_to_hydrophilic': 'no_change',
}

my_df['water_change'] = my_df['water_change'].map(water_prop_changeD)
my_df['water_change'].value_counts()

#%%
# Polarity property: same two-step labelling as for water.
my_df['polarity_change'] = my_df['wt_prop_polarity'] + '_to_' + my_df['mut_prop_polarity']
my_df['polarity_change'].value_counts()

# add a no change category
polarity_prop_changeD = {
    'non-polar_to_non-polar': 'no_change',
    'non-polar_to_neutral': 'change',
    'neutral_to_non-polar': 'change',
    'neutral_to_neutral': 'no_change',
    'non-polar_to_basic': 'change',
    'acidic_to_neutral': 'change',
    'basic_to_neutral': 'change',
    'non-polar_to_acidic': 'change',
    'neutral_to_basic': 'change',
    'acidic_to_non-polar': 'change',
    'basic_to_non-polar': 'change',
    'neutral_to_acidic': 'change',
    'acidic_to_acidic': 'no_change',
    'basic_to_acidic': 'change',
    'basic_to_basic': 'no_change',
    'acidic_to_basic': 'change',
}

my_df['polarity_change'] = my_df['polarity_change'].map(polarity_prop_changeD)
my_df['polarity_change'].value_counts()

#%%
# Electrostatics (calcprop) property: same two-step labelling as above.
my_df['electrostatics_change'] = my_df['wt_calcprop'] + '_to_' + my_df['mut_calcprop']
my_df['electrostatics_change'].value_counts()

calc_prop_changeD = {
    'non-polar_to_non-polar': 'no_change',
    'non-polar_to_polar': 'change',
    'polar_to_non-polar': 'change',
    'non-polar_to_pos': 'change',
    'neg_to_non-polar': 'change',
    'non-polar_to_neg': 'change',
    'pos_to_polar': 'change',
    'pos_to_non-polar': 'change',
    'polar_to_polar': 'no_change',
    'neg_to_neg': 'no_change',
    'polar_to_neg': 'change',
    'pos_to_neg': 'change',
    'pos_to_pos': 'no_change',
    'polar_to_pos': 'change',
    'neg_to_polar': 'change',
    'neg_to_pos': 'change',
}

my_df['electrostatics_change'] = my_df['electrostatics_change'].map(calc_prop_changeD)
my_df['electrostatics_change'].value_counts()

#%%
# https://stackoverflow.com/questions/47181187/finding-string-over-multiple-columns-in-pandas
detect_change = 'change'
# if detect_change in my_df['water_change'] | my_df['polarity_change'] | my_df['electrostatics_change']:
#     print('\nChange detected')

check = ['mutationinformation', 'wild_type', 'water_change', 'polarity_change', 'electrostatics_change']
check_prop_cols = ['water_change', 'polarity_change', 'electrostatics_change']

# Explicit copy: new columns are added to foo below, and assigning into a
# plain column subset of my_df triggers pandas' SettingWithCopyWarning.
foo = my_df[check].copy()

# 1 when any cell in the row is exactly 'change', else 0.  Exact equality is
# used on purpose: a substring test would also match 'no_change'.
foo['aa_prop_change'] = (foo.values == detect_change).any(axis=1).astype(int)
# foo['aa_prop_change3'] = foo[check_prop_cols].applymap(lambda x: detect_change in x).any(1).astype(int)  # loose match so always 1

# Same flag computed over the three property columns only.
foo['aa_prop_change2'] = (foo[check_prop_cols].values == detect_change).any(axis=1).astype(int)

# Sanity check (interactive): both flags are expected to agree on every row.
all(foo['aa_prop_change'] == foo['aa_prop_change2'])

#%%lineage
# snp freq and lineage_count_all differ because same mut can be in more than 1 lineage
lineage_colnames = ['snp_frequency', 'lineage', 'lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode']

# Explicit copy for the same reason as foo: columns are added below.
bar = my_df[lineage_colnames].copy()

# Denominator for dist_lineage_proportion.
# NOTE(review): presumably the total number of distinct lineages in the
# dataset -- confirm before reusing with other data.
tot_lineage_u = 8

bar['lineage'].value_counts()

bar['lineage_proportion'] = bar['lineage_count_unique'] / bar['lineage_count_all']
bar['dist_lineage_proportion'] = bar['lineage_count_unique'] / tot_lineage_u