diff --git a/test_data/processing.py b/test_data/processing.py index 011eae2..f75afee 100644 --- a/test_data/processing.py +++ b/test_data/processing.py @@ -9,53 +9,182 @@ import sys, os import pandas as pd import numpy as np from statistics import mean, median, mode -#from statistics import multimode +from statistics import multimode from collections import Counter -import math +#import math # https://stackoverflow.com/questions/43321455/pandas-count-null-values-in-a-groupby-function +# https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean +# round up +#int(math.ceil(mean(foo))) #https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean -#%% +#%% Read data and formatting drug = "pyrazinamide" data = pd.read_csv("/home/tanu/git/ML_AI_training/test_data/sample_data.csv") data.columns +# COPY mutation_info_labels column +data['mutation_info_labels_orig'] = data['mutation_info_labels'] + # Convert DM/OM labels to numeric dm_om_map = {'DM': 1, 'OM': 0} # pnca, OM is minority, other genes: DM is minority data['dm_om_numeric'] = data['mutation_info_labels'].map(dm_om_map) # sanity check -data['dm_om_numeric'].value_counts() +data['dm_om_numeric'].value_counts() data['mutation_info_labels'].value_counts() +# Convert drtype column to numeric +drtype_map = {'XDR': 5 + , 'Pre-XDR': 4 + , 'MDR': 3 + , 'Pre-MDR': 2 + , 'Other': 1 + , 'Sensitive': 0} + +data['drtype_numeric'] = data['drtype'].map(drtype_map) + # COPY dst column -data['dst'] = data[drug] +data['dst'] = data[drug] # to allow cross checking +data['dst_multimode'] = data[drug] + # sanity check data[drug].value_counts() -data[drug].isnull().sum() +data['dst_multimode'].value_counts() -data['dst'].value_counts() -data['dst'].isnull().sum() +data[drug].isnull().sum() +data['dst_multimode'].isnull().sum() data['mutationinformation'].value_counts() #data.C.isnull().groupby([df['A'],df['B']]).sum().astype(int).reset_index(name='count') data[drug].isnull().groupby(data['mutationinformation']).sum() # GOAL is to populate na in the dst column from the count of the dm_om_numeric column -data['dst'].isnull().groupby(data['mutationinformation']).sum() +data['dst_multimode'].isnull().groupby(data['mutationinformation']).sum() +# COPY mutationinformation for sanity check +data['mutation'] = data['mutationinformation'] -# round up -int(math.ceil(mean(foo))) -#https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean -#FIXME +#%% POC: fill na with mean/mode/median/max for each mutation # STAGE 1: replace mean with Max(multimode), atm it is MEAN #na_val = data.groupby(data['mutationinformation'])['dst'].mean() +data['dst_multimode'].fillna(data.groupby('mutationinformation')['dst_multimode'].transform('mean')) +data['dst_multimode'].fillna(data.groupby('mutationinformation')['dm_om_numeric'].transform('mean')) -data['dst'] = data['dst'].fillna(data.groupby('mutationinformation')['dst'].transform('mean')) +# STAGE 2: Fill TRUE nan with DM.OM column value, atm it is MEAN +#data['dst_mean_check'] = data['dst'].fillna(data.groupby('mutationinformation')['dst'].transform('mean')) +#data['dst_mean_check'] = data['dst'].fillna(data.groupby('mutationinformation')['dm_om_numeric'].transform('mean')) +#%% POC continued: Test getting mode +#data.groupby('mutationinformation')['dm_om_numeric'].mode() +data.groupby('mutationinformation')['dm_om_numeric'].agg(mode) +data.groupby('mutationinformation')['dm_om_numeric'].agg(multimode) +foo = data.groupby('mutationinformation')['dm_om_numeric'].agg(multimode) +foo +foo = foo.to_frame() +foo['dm_om_numeric'].apply(lambda x: max(x))# returns nan +foo['dm_om_numeric'].apply(lambda x: np.nanmax(x)) +#foo.assign(dst_mode = lambda x: (x['dst'])) +foo['multimode_extract'] = foo['dm_om_numeric'].apply(lambda x: max(x)) +foo['multimode_extract'] +#%% Recalculating columns [dst, drtype and mutation_info_labels]: SET Index as 'mutationinformation' +data2 = data.copy() +# Reset index as it allows the groupby expression to directly map +data2 = data2.set_index(['mutationinformation']) +#%% Recalculating dst: my data +#------------------------------ +# Revised dst: max(multimode) +#------------------------------ +# For each mutation, generate the revised dst which is the mode of dm_om_numeric +# PROBLEM: Returns the smallest of the two when bimodal, and fails when all equally likely +# SOLUTION: Using max of the 'dst_noNA' column +#data2.groupby('mutationinformation')['dm_om_numeric'].agg(multimode) -# FIXME -#STAGE 2: Fill TRUE nan with DM.OM column value, atm it is MEAN -data['dst2'] = data['dst'].fillna(data.groupby('mutationinformation')['dm_om_numeric'].transform('mean')) -data['dst2'] = data['dst'].fillna(data.groupby('mutationinformation').transform(['dm_om_numeric'])) - \ No newline at end of file +# Get multimode for dm_om_numeric column +dm_om_multimode = data2.groupby('mutationinformation')['dm_om_numeric'].agg(multimode) +#dm_om_multimode + +# Fill using multimode ONLY where NA in dst_multimode column +#data2['dst_multimode'] = data2['dst_multimode'].fillna(dm_om_multimode) +data2['dst_multimode'] = data2['dst_multimode'].fillna(dm_om_multimode) + +# data2['dst_multimode'] + +# Now get the max from multimode +data2['dst_noNA'] = data2['dst_multimode'].apply(lambda x: np.nanmax(x)) +print(data2) + +# Finally created a revised dst with the max from the multimode +data2['dst_mode'] = data2.groupby('mutationinformation')['dst_noNA'].max() +#============================================================================== +#%% Recalculating drtype: my data +#-------------------------------- +# drtype: ALL values: +# numeric and names in an array +#-------------------------------- +data2['drtype_all_vals'] = data2['drtype_numeric'] +data2['drtype_all_names'] = data2['drtype'] + +# example: https://stackoverflow.com/questions/55125680/pandas-get-all-groupby-values-in-an-array +# print(df.groupby('key').data.apply(list).reset_index()) # my use case, don't need the reset_index() +data2['drtype_all_vals'] = data2.groupby('mutationinformation').drtype_all_vals.apply(list) +data2['drtype_all_names'] = data2.groupby('mutationinformation').drtype_all_names.apply(list) + +#--------------------------------- +# Revised drtype: max(Multimode) +#-------------------------------- +data2['drtype_multimode'] = data2.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode) +data2['drtype_multimode'] + +# Now get the max from multimode +data2['drtype_mode'] = data2['drtype_multimode'].apply(lambda x: np.nanmax(x)) +data2.head() + +#---------------------- +# Revised drtype: Max +#---------------------- +data2.head() +data2['drtype_max'] = data2.groupby(['mutationinformation'])['drtype_numeric'].max() +#data2 = data2.reset_index() +data2.head() + +#%% Finally reset index +data2 = data2.reset_index() +#============================================================================== +#--------------------------------------- +# Create revised mutation_info_column +#--------------------------------------- +data2['dst_mode'].value_counts() +data2[drug].value_counts() + +# note this is overriding, since downstream depends on it +# make a copy you if you need to keep that +data2['mutation_info_labels_orig'] = data2['mutation_info_labels'] +data2['mutation_info_labels'] = data2['dst_mode'].map({1: 'DM' + , 0: 'OM'}) +data2['mutation_info_labels_orig'].value_counts() +data2['mutation_info_labels'].value_counts() +#============================================================================== +# sanity check +if (all(data2['mutation'] == data2['mutationinformation'])): + print('\nPass: Mutationinformation check successful') +else: + sys.exit('\nERROR: mutationin cross checks failed. Please check your group_by() aggregate functions') + +# Drop mutation column +data2.drop(['mutation'], axis=1, inplace=True) + +#%% Process lineage info +# add how many different lineages a sample is represented in? +#%% subset: equivalent of merged_df3? +# https://stackoverflow.com/questions/39900061/sort-lists-in-a-pandas-dataframe-column + +# result = data2['dst_multimode'].sort_values().apply(lambda x: sorted(x)) +# newdf = pd.DataFrame({'dst_multimode': Series(list(set(result['a'].apply(tuple))))}) +# newdf.sort_values(by='a') + +# data2['dst_multimode'].value_counts() +# data2.sort_values(['dst_multimode'], ascending=False) + +data_df3 = data2.drop_duplicates(['mutationinformation']) +data_df3_v2 = data2.drop_duplicates(['mutationinformation']) +all(data_df3 == data_df3_v2) \ No newline at end of file diff --git a/test_data/sample_data.csv b/test_data/sample_data.csv index 5fd5515..f77e4aa 100644 --- a/test_data/sample_data.csv +++ b/test_data/sample_data.csv @@ -1,26 +1,26 @@ -id,mutationinformation,position,pyrazinamide,mutation_info_labels,drtype -S1,M1A,1,0,DM,MDR -S2,M1A,1,1,DM,Pre-MDR -S3,M1A,1,1,OM,Sensitive -S4,M1A,1,NA,OM,Others -S5,M1A,1,1,OM,Pre-XDR -S6,M1A,1,1,DM,XDR -S7,M1B,1,NA,OM,MDR -S8,M1B,1,1,DM,MDR -S9,M1B,1,NA,DM,Other -S10,M1B,1,0,OM,Sensitive -S11,M1C,1,NA,OM,Pre-XDR -S12,M1C,1,NA,OM,Pre-XDR -S13,M1C,1,1,OM,MDR -S14,M1C,1,NA,DM,MDR -S15,A2B,2,0,OM,Others -S16,A2B,2,0,OM,XDR -S17,A2C,2,NA,DM,Pre-MDR -S18,A2C,2,1,DM,Pre-MDR -S19,D3E,3,1,DM,XDR -S20,D3E,3,NA,DM,MDR -S21,D3E,3,NA,OM,Pre-MDR -S22,D3P,3,0,OM,Pre-MDR -S23,D3A,3,0,OM,Sensitive -S24,P4A,4,NA,OM,Others -S25,P5A,5,1,DM,Sensitive +sample,mutationinformation,position,pyrazinamide,mutation_info_labels,drtype,lineage +S1,M1A,1,0,DM,MDR,l1 +S2,M1A,1,1,DM,Pre-MDR,l2 +S3,M1A,1,1,OM,Sensitive,l1 +S4,M1A,1,NA,OM,Other,l3 +S5,M1A,1,1,OM,Pre-XDR,l2 +S6,M1A,1,1,DM,XDR,l4 +S7,M1B,1,NA,OM,MDR,l1 +S8,M1B,1,1,DM,Other,l1 +S9,M1B,1,NA,DM,Other,l2 +S10,M1B,1,0,OM,Sensitive,l2 +S11,M1C,1,NA,OM,Pre-XDR,l3 +S12,M1C,1,NA,OM,Pre-XDR,l1 +S13,M1C,1,1,OM,MDR,l1 +S14,M1C,1,NA,DM,MDR,l2 +S15,A2B,2,0,OM,Other,l4 +S16,A2B,2,0,OM,XDR,l4 +S17,A2C,2,NA,DM,Pre-MDR,l5 +S18,A2C,2,1,DM,Pre-MDR,l1 +S19,D3E,3,1,DM,XDR,l2 +S20,D3E,3,NA,DM,MDR,l2 +S21,D3E,3,NA,OM,Pre-MDR,l1 +S22,D3P,3,0,OM,Pre-MDR,l2 +S23,D3A,3,0,OM,Sensitive,l5 +S24,P4A,4,NA,OM,Other,l6 +S25,P5A,5,1,DM,Sensitive,l4 diff --git a/test_data/sample_data.ods b/test_data/sample_data.ods index aff3cca..73a79a8 100644 Binary files a/test_data/sample_data.ods and b/test_data/sample_data.ods differ