Tidied up ML data processing for actual processing

This commit is contained in:
Tanushree Tunstall 2022-05-29 06:03:36 +01:00
parent cbfbb525fa
commit 693a5324c1
2 changed files with 118 additions and 90 deletions

View file

@ -94,16 +94,17 @@ check = ['mutationinformation', 'wild_type', 'water_change', 'polarity_change',
# NOTE(review): this span looks like a diff hunk shown without +/- markers, so
# near-duplicate statements ('new'/'new3' vs 'aa_prop_change'/'aa_prop_change2',
# and the two `lineage_colnames` assignments) are presumably the old and new
# versions of the same line -- TODO confirm against the actual file.

# Subset of columns holding the amino-acid property change labels.
check_prop_cols = ['water_change', 'polarity_change', 'electrostatics_change']
# Select the columns named in `check` from `my_df` (both defined outside this view).
# NOTE(review): columns are assigned onto this selection below, which may raise
# pandas' SettingWithCopyWarning -- confirm whether an explicit copy is intended.
foo = my_df[check]
# 1 if ANY cell in the row (across every column of `foo`) equals `detect_change`, else 0.
foo['new'] = (foo.values == detect_change).any(1).astype(int)
#foo['new2'] = foo[check_prop_cols].applymap(lambda x: detect_change in x).any(1).astype(int) # loose (substring) match so always 1
# Same flag, but the comparison is restricted to the three property-change columns.
foo['new3'] = (foo[check_prop_cols].values == detect_change).any(1).astype(int)
# Row-wise flag over every column of `foo` as it stands at this point (this
# includes any columns already added above).
foo['aa_prop_change'] = (foo.values == detect_change).any(1).astype(int)
#foo['aa_prop_change3'] = foo[check_prop_cols].applymap(lambda x: detect_change in x).any(1).astype(int) # loose (substring) match so always 1
# Row-wise flag restricted to the property-change columns only.
foo['aa_prop_change2'] = (foo[check_prop_cols].values == detect_change).any(1).astype(int)
# Sanity checks that both ways of computing the flag agree. The boolean result
# is not stored or asserted, so it is only visible when run interactively.
all(foo['new'] == foo['new3'])
all(foo['aa_prop_change'] == foo['aa_prop_change2'])
#%%lineage
# Lineage-related columns to pull out of `my_df`.
lineage_colnames = ['lineage', 'lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode']
# snp freq and lineage_count_all differ because same mut can be in more than 1 lineage
# This second assignment overrides the one above and additionally includes 'snp_frequency'.
lineage_colnames = ['snp_frequency', 'lineage', 'lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode']
bar = my_df[lineage_colnames]
# Hard-coded denominator used below; presumably the total number of distinct
# lineages in the dataset -- TODO confirm and derive from the data if possible.
tot_lineage_u = 8
# Frequency table of the 'lineage' column; the result is not stored (interactive inspection).
bar['lineage'].value_counts()
# Per-row ratio of the unique lineage count to the overall lineage count.
bar['lineage_proportion'] = bar['lineage_count_unique']/bar['lineage_count_all']
# Per-row unique lineage count divided by the fixed total `tot_lineage_u`.
bar['dist_lineage_proportion'] = bar['lineage_count_unique']/tot_lineage_u