diff --git a/test_data/processing.py b/test_data/processing.py index 135c174..e05e3f7 100644 --- a/test_data/processing.py +++ b/test_data/processing.py @@ -11,19 +11,66 @@ import numpy as np from statistics import mean, median, mode from statistics import multimode from collections import Counter +from tidy_split import tidy_split + #import math # https://stackoverflow.com/questions/43321455/pandas-count-null-values-in-a-groupby-function # https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean # round up -#int(math.ceil(mean(foo))) -#https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean +# int(math.ceil(mean(foo))) +# https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean +# https://stackoverflow.com/questions/37189878/pandas-add-column-to-groupby-dataframe +# https://stackoverflow.com/questions/43847520/how-to-get-the-distinct-count-of-values-in-a-python-pandas-dataframe #%% Read data and formatting drug = "pyrazinamide" data = pd.read_csv("/home/tanu/git/ML_AI_training/test_data/sample_data.csv") data.columns +data.head() +#%% Quick checks: Lineage and sample count for each mutation +data['id'].nunique() +data['mutationinformation'].nunique() +total_id_ucount = data['id'].nunique() +total_id_ucount + +data.groupby('mutationinformation')['lineage'].size() +data.groupby('mutationinformation')['lineage_corrupt'].size() +data.groupby('mutationinformation')['id'].size() +data.groupby('mutationinformation')['lineage'].value_counts() +data.groupby('mutationinformation')['lineage'].nunique() +#%% id count: add list of all ids and count of unique ids per mutation +data['id_list'] = data['mutationinformation'].map(data.groupby('mutationinformation')['id'].apply(list)) +data['id_ucount'] = data['mutationinformation'].map(data.groupby('mutationinformation')['id'].nunique()) +data[['mutationinformation', 'id', 'id_list', 'id_ucount']] 
+#%% Lineages: add all lineages and count of unique lineages per mutation +# Lineages good: lineage column has a single lineage value in each row +data['lineage'] +data['lineage_list'] = data['mutationinformation'].map(data.groupby('mutationinformation')['lineage'].apply(list)) +data['lineage_ucount'] = data['mutationinformation'].map(data.groupby('mutationinformation')['lineage'].nunique()) +data[['mutationinformation', 'lineage', 'lineage_list', 'lineage_ucount']] + +# Lineage corrupt: lineage_corrupt column can hold multiple lineages in a single row, separated by ';' +data['lineage_corrupt'] +# split using tidy_split() +data_split = tidy_split(data, 'lineage_corrupt', sep = ';') +# strip surrounding whitespace, else padded values are counted as distinct lineages as well +#data_split['lineage_corrupt'] = data_split['lineage_corrupt'].str.lstrip() +data_split['lineage_corrupt'] = data_split['lineage_corrupt'].str.strip() +data_split.head() + +data_split['lineage_corrupt_list'] = data_split['mutationinformation'].map(data_split.groupby('mutationinformation')['lineage_corrupt'].apply(list)) +data_split['lineage_corrupt_ucount'] = data_split['mutationinformation'].map(data_split.groupby('mutationinformation')['lineage_corrupt'].nunique()) + +data_split[['mutationinformation', 'lineage_corrupt_list', 'lineage_corrupt_ucount']] +data_split[['mutationinformation', 'lineage_ucount', 'lineage_corrupt_ucount']] + +#%% AF: calculate AF for each mutation +#1) calculate no. of unique ids +data['id_ucount']/total_id_ucount + +#%% DM OM labels # COPY mutation_info_labels column data['mutation_info_labels_orig'] = data['mutation_info_labels'] @@ -172,18 +219,6 @@ else: # Drop mutation column data2.drop(['mutation'], axis=1, inplace=True) - -#%% Process lineage info -# add how many different lineages a sample is represented in? 
-# https://stackoverflow.com/questions/37189878/pandas-add-column-to-groupby-dataframe -# https://stackoverflow.com/questions/43847520/how-to-get-the-distinct-count-of-values-in-a-python-pandas-dataframe -data2.groupby('mutationinformation')['lineage'].size() # sample count -data2.groupby('mutationinformation')['sample'].size() -data2.groupby('mutationinformation')['lineage'].value_counts() - -data2.groupby('mutationinformation')['lineage'].nunique() -data2['lin_count'] = data2['mutationinformation'].map(data2.groupby('mutationinformation')['lineage'].nunique()) - #%% subset: equivalent of merged_df3? # https://stackoverflow.com/questions/39900061/sort-lists-in-a-pandas-dataframe-column diff --git a/test_data/sample_data.csv b/test_data/sample_data.csv index f77e4aa..39102a8 100644 --- a/test_data/sample_data.csv +++ b/test_data/sample_data.csv @@ -1,26 +1,28 @@ -sample,mutationinformation,position,pyrazinamide,mutation_info_labels,drtype,lineage -S1,M1A,1,0,DM,MDR,l1 -S2,M1A,1,1,DM,Pre-MDR,l2 -S3,M1A,1,1,OM,Sensitive,l1 -S4,M1A,1,NA,OM,Other,l3 -S5,M1A,1,1,OM,Pre-XDR,l2 -S6,M1A,1,1,DM,XDR,l4 -S7,M1B,1,NA,OM,MDR,l1 -S8,M1B,1,1,DM,Other,l1 -S9,M1B,1,NA,DM,Other,l2 -S10,M1B,1,0,OM,Sensitive,l2 -S11,M1C,1,NA,OM,Pre-XDR,l3 -S12,M1C,1,NA,OM,Pre-XDR,l1 -S13,M1C,1,1,OM,MDR,l1 -S14,M1C,1,NA,DM,MDR,l2 -S15,A2B,2,0,OM,Other,l4 -S16,A2B,2,0,OM,XDR,l4 -S17,A2C,2,NA,DM,Pre-MDR,l5 -S18,A2C,2,1,DM,Pre-MDR,l1 -S19,D3E,3,1,DM,XDR,l2 -S20,D3E,3,NA,DM,MDR,l2 -S21,D3E,3,NA,OM,Pre-MDR,l1 -S22,D3P,3,0,OM,Pre-MDR,l2 -S23,D3A,3,0,OM,Sensitive,l5 -S24,P4A,4,NA,OM,Other,l6 -S25,P5A,5,1,DM,Sensitive,l4 +id,old,mutationinformation,position,pyrazinamide,mutation_info_labels,drtype,lineage_corrupt,lineage +S1,S1,M1A,1,0,DM,MDR,l1; l3; l4 ,l1 +S2,S2,M1A,1,1,DM,Pre-MDR,l2,l2 +S3,S3,M1A,1,1,OM,Sensitive,l1,l1 +S4,S4,M1A,1,NA,OM,Other,l3,l3 +S5,S5,M1A,1,1,OM,Pre-XDR,l2,l2 +S6,S6,M1A,1,1,DM,XDR,l4,l3 +S1,S7,M1B,1,NA,OM,MDR,l1,l1 +S7,S8,M1B,1,1,DM,Other,l1,l1 +S8,S9,M1B,1,NA,DM,Other,l2,l2 
+S2,S10,M1B,1,0,OM,Sensitive,l2,l2 +S3,S11,M1C,1,NA,OM,Pre-XDR,l3,l3 +S4,S12,M1C,1,NA,OM,Pre-XDR,l1,l1 +S8,S13,M1C,1,1,OM,MDR,l1,l1 +S8,S14,M1C,1,NA,DM,MDR,l2,l2 +S4,S15,A2B,2,0,OM,Other,l4,l4 +S3,S16,A2B,2,0,OM,XDR,l4,l4 +S2,S17,A2C,2,NA,DM,Pre-MDR,l5,l5 +S1,S18,A2C,2,1,DM,Pre-MDR,l1,l1 +S7,S19,D3E,3,1,DM,XDR,l2,l2 +S8,S20,D3E,3,NA,DM,MDR,l2,l2 +S8,S21,D3E,3,NA,OM,Pre-MDR,l1,l1 +S5,S22,D3P,3,0,OM,Pre-MDR,l2,l2 +S6,S23,D3A,3,0,OM,Sensitive,l5,l5 +S7,S24,P4A,4,NA,OM,Other,l6,l6 +S8,S25,P5A,5,1,DM,Sensitive,l4,l4 +S8,S26,Q6L,6,1,DM,Others,l2,l2 +S4,S27,Q6L,6,NA,OM,MDR,l5; l2,l5