diff --git a/test_data/processing.py b/test_data/processing.py index f75afee..135c174 100644 --- a/test_data/processing.py +++ b/test_data/processing.py @@ -175,6 +175,15 @@ data2.drop(['mutation'], axis=1, inplace=True) #%% Process lineage info # add how many different lineages a sample is represented in? +# https://stackoverflow.com/questions/37189878/pandas-add-column-to-groupby-dataframe +# https://stackoverflow.com/questions/43847520/how-to-get-the-distinct-count-of-values-in-a-python-pandas-dataframe +data2.groupby('mutationinformation')['lineage'].size() # sample count +data2.groupby('mutationinformation')['sample'].size() +data2.groupby('mutationinformation')['lineage'].value_counts() + +data2.groupby('mutationinformation')['lineage'].nunique() +data2['lin_count'] = data2['mutationinformation'].map(data2.groupby('mutationinformation')['lineage'].nunique()) + #%% subset: equivalent of merged_df3? # https://stackoverflow.com/questions/39900061/sort-lists-in-a-pandas-dataframe-column @@ -185,6 +194,7 @@ data2.drop(['mutation'], axis=1, inplace=True) # data2['dst_multimode'].value_counts() # data2.sort_values(['dst_multimode'], ascending=False) -data_df3 = data2.drop_duplicates(['mutationinformation']) -data_df3_v2 = data2.drop_duplicates(['mutationinformation']) -all(data_df3 == data_df3_v2) \ No newline at end of file +#data_df3 = data2.drop_duplicates(['mutationinformation']) +#data_df3_v2 = data2.drop_duplicates(['mutationinformation']) +#all(data_df3 == data_df3_v2) +#%% \ No newline at end of file