diff --git a/test_data/processing.py b/test_data/processing.py
index 011eae2..f75afee 100644
--- a/test_data/processing.py
+++ b/test_data/processing.py
@@ -9,53 +9,182 @@ import sys, os
 import pandas as pd
 import numpy as np
 from statistics import mean, median, mode
-#from statistics import multimode
+from statistics import multimode
 from collections import Counter
-import math
+#import math
 
 # https://stackoverflow.com/questions/43321455/pandas-count-null-values-in-a-groupby-function
+# https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
+# round up
+#int(math.ceil(mean(foo)))  
 #https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
-#%%
+#%% Read data and formatting
 drug = "pyrazinamide"
 
 data = pd.read_csv("/home/tanu/git/ML_AI_training/test_data/sample_data.csv")
 data.columns
 
+# COPY mutation_info_labels column
+data['mutation_info_labels_orig'] = data['mutation_info_labels']
+
 # Convert DM/OM labels to numeric
 dm_om_map = {'DM': 1, 'OM': 0} # pnca, OM is minority, other genes: DM is minority
 data['dm_om_numeric'] = data['mutation_info_labels'].map(dm_om_map)
 # sanity check
-data['dm_om_numeric'].value_counts()
+data['dm_om_numeric'].value_counts() 
 data['mutation_info_labels'].value_counts()
 
+# Convert drtype column to numeric
+drtype_map = {'XDR': 5
+              , 'Pre-XDR': 4
+              , 'MDR': 3
+              , 'Pre-MDR': 2
+              , 'Other': 1
+              , 'Sensitive': 0}
+
+data['drtype_numeric']  = data['drtype'].map(drtype_map)
+
 # COPY dst column
-data['dst'] = data[drug]
+data['dst'] = data[drug] # to allow cross checking
+data['dst_multimode'] = data[drug]
+
 # sanity check
 data[drug].value_counts()
-data[drug].isnull().sum()
+data['dst_multimode'].value_counts()
 
-data['dst'].value_counts()
-data['dst'].isnull().sum()
+data[drug].isnull().sum()
+data['dst_multimode'].isnull().sum()
 
 data['mutationinformation'].value_counts()
 #data.C.isnull().groupby([df['A'],df['B']]).sum().astype(int).reset_index(name='count')
 data[drug].isnull().groupby(data['mutationinformation']).sum()
                              
 # GOAL is to populate na in the dst column from the count of the dm_om_numeric column                          
-data['dst'].isnull().groupby(data['mutationinformation']).sum()                       
+data['dst_multimode'].isnull().groupby(data['mutationinformation']).sum()                       
 
+# COPY mutationinformation for sanity check
+data['mutation'] = data['mutationinformation']
 
-# round up
-int(math.ceil(mean(foo)))  
-#https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
-#FIXME
+#%% POC: fill na with mean/mode/median/max for each mutation
 # STAGE 1: replace mean with  Max(multimode), atm it is MEAN
 #na_val = data.groupby(data['mutationinformation'])['dst'].mean()
+data['dst_multimode'].fillna(data.groupby('mutationinformation')['dst_multimode'].transform('mean'))
+data['dst_multimode'].fillna(data.groupby('mutationinformation')['dm_om_numeric'].transform('mean'))
 
-data['dst'] = data['dst'].fillna(data.groupby('mutationinformation')['dst'].transform('mean'))
+# STAGE 2: Fill TRUE NaN with the DM/OM column value; at the moment it is the MEAN
+#data['dst_mean_check'] = data['dst'].fillna(data.groupby('mutationinformation')['dst'].transform('mean'))
+#data['dst_mean_check'] = data['dst'].fillna(data.groupby('mutationinformation')['dm_om_numeric'].transform('mean'))
+#%% POC continued: Test getting mode
+#data.groupby('mutationinformation')['dm_om_numeric'].mode()
+data.groupby('mutationinformation')['dm_om_numeric'].agg(mode)
+data.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
+foo = data.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
+foo
+foo = foo.to_frame()
+foo['dm_om_numeric'].apply(lambda x: max(x))# returns nan
+foo['dm_om_numeric'].apply(lambda x: np.nanmax(x))
+#foo.assign(dst_mode = lambda x: (x['dst']))
+foo['multimode_extract'] = foo['dm_om_numeric'].apply(lambda x: max(x))
+foo['multimode_extract'] 
+#%% Recalculating columns [dst, drtype and mutation_info_labels]: SET Index as 'mutationinformation'
+data2 = data.copy()
+# Set 'mutationinformation' as the index so groupby results align directly on assignment
+data2 = data2.set_index(['mutationinformation'])
+#%% Recalculating dst: my data
+#------------------------------
+# Revised dst: max(multimode)
+#------------------------------
+# For each mutation, generate the revised dst which is the mode of dm_om_numeric
+# PROBLEM: Returns the smallest of the two when bimodal, and fails when all equally likely
+# SOLUTION: Using max of the 'dst_noNA' column
+#data2.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
 
-# FIXME
-#STAGE 2: Fill TRUE nan with DM.OM column value, atm it is MEAN
-data['dst2'] = data['dst'].fillna(data.groupby('mutationinformation')['dm_om_numeric'].transform('mean'))
-data['dst2'] = data['dst'].fillna(data.groupby('mutationinformation').transform(['dm_om_numeric']))
-     
\ No newline at end of file
+# Get multimode for dm_om_numeric column
+dm_om_multimode = data2.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
+#dm_om_multimode
+
+# Fill using multimode ONLY where NA in dst_multimode column
+#data2['dst_multimode'] = data2['dst_multimode'].fillna(dm_om_multimode)
+data2['dst_multimode'] = data2['dst_multimode'].fillna(dm_om_multimode)
+
+# data2['dst_multimode']
+
+# Now get the max from multimode
+data2['dst_noNA'] = data2['dst_multimode'].apply(lambda x: np.nanmax(x))
+print(data2)
+
+# Finally, create a revised dst with the max from the multimode
+data2['dst_mode']  = data2.groupby('mutationinformation')['dst_noNA'].max()
+#==============================================================================
+#%% Recalculating drtype: my data
+#--------------------------------
+# drtype: ALL values:
+# numeric and names in an array 
+#--------------------------------
+data2['drtype_all_vals']  = data2['drtype_numeric']
+data2['drtype_all_names'] = data2['drtype']
+
+# example: https://stackoverflow.com/questions/55125680/pandas-get-all-groupby-values-in-an-array
+# print(df.groupby('key').data.apply(list).reset_index()) # my use case, don't need the reset_index()
+data2['drtype_all_vals']  = data2.groupby('mutationinformation').drtype_all_vals.apply(list)
+data2['drtype_all_names'] = data2.groupby('mutationinformation').drtype_all_names.apply(list)
+
+#---------------------------------
+# Revised drtype: max(Multimode)
+#--------------------------------
+data2['drtype_multimode'] = data2.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode)
+data2['drtype_multimode']
+
+# Now get the max from multimode
+data2['drtype_mode'] = data2['drtype_multimode'].apply(lambda x: np.nanmax(x))
+data2.head()
+
+#----------------------
+# Revised drtype: Max
+#----------------------
+data2.head()
+data2['drtype_max'] =  data2.groupby(['mutationinformation'])['drtype_numeric'].max()
+#data2 = data2.reset_index()
+data2.head()
+
+#%% Finally reset index
+data2 = data2.reset_index()
+#==============================================================================
+#---------------------------------------
+# Create revised mutation_info_column
+#---------------------------------------
+data2['dst_mode'].value_counts()
+data2[drug].value_counts()
+
+# NOTE: this overwrites mutation_info_labels, since downstream code depends on it;
+# make a copy first if you need to keep the original values
+data2['mutation_info_labels_orig'] =  data2['mutation_info_labels'] 
+data2['mutation_info_labels']  = data2['dst_mode'].map({1: 'DM'
+                                                    , 0: 'OM'})
+data2['mutation_info_labels_orig'].value_counts()
+data2['mutation_info_labels'].value_counts()
+#==============================================================================
+# sanity check
+if (all(data2['mutation'] == data2['mutationinformation'])):
+    print('\nPass: Mutationinformation check successful')
+else:
+    sys.exit('\nERROR: mutationin cross checks failed. Please check your group_by() aggregate functions')
+
+# Drop mutation column
+data2.drop(['mutation'], axis=1, inplace=True)
+
+#%% Process lineage info
+# add how many different lineages a sample is represented in?
+#%% subset: equivalent of merged_df3?
+# https://stackoverflow.com/questions/39900061/sort-lists-in-a-pandas-dataframe-column
+
+# result = data2['dst_multimode'].sort_values().apply(lambda x: sorted(x))
+# newdf = pd.DataFrame({'dst_multimode': Series(list(set(result['a'].apply(tuple))))})
+# newdf.sort_values(by='a')
+
+# data2['dst_multimode'].value_counts()
+# data2.sort_values(['dst_multimode'], ascending=False)
+
+data_df3 = data2.drop_duplicates(['mutationinformation'])  # keep one row per mutation
+data_df3_v2 = data2.drop_duplicates(['mutationinformation'])  # identical second call, only for the cross-check below
+data_df3.equals(data_df3_v2)  # all(df == df2) iterates column labels (always True); equals() compares the values
\ No newline at end of file
diff --git a/test_data/sample_data.csv b/test_data/sample_data.csv
index 5fd5515..f77e4aa 100644
--- a/test_data/sample_data.csv
+++ b/test_data/sample_data.csv
@@ -1,26 +1,26 @@
-id,mutationinformation,position,pyrazinamide,mutation_info_labels,drtype
-S1,M1A,1,0,DM,MDR
-S2,M1A,1,1,DM,Pre-MDR
-S3,M1A,1,1,OM,Sensitive
-S4,M1A,1,NA,OM,Others
-S5,M1A,1,1,OM,Pre-XDR
-S6,M1A,1,1,DM,XDR
-S7,M1B,1,NA,OM,MDR
-S8,M1B,1,1,DM,MDR
-S9,M1B,1,NA,DM,Other
-S10,M1B,1,0,OM,Sensitive
-S11,M1C,1,NA,OM,Pre-XDR
-S12,M1C,1,NA,OM,Pre-XDR
-S13,M1C,1,1,OM,MDR
-S14,M1C,1,NA,DM,MDR
-S15,A2B,2,0,OM,Others
-S16,A2B,2,0,OM,XDR
-S17,A2C,2,NA,DM,Pre-MDR
-S18,A2C,2,1,DM,Pre-MDR
-S19,D3E,3,1,DM,XDR
-S20,D3E,3,NA,DM,MDR
-S21,D3E,3,NA,OM,Pre-MDR
-S22,D3P,3,0,OM,Pre-MDR
-S23,D3A,3,0,OM,Sensitive
-S24,P4A,4,NA,OM,Others
-S25,P5A,5,1,DM,Sensitive
+sample,mutationinformation,position,pyrazinamide,mutation_info_labels,drtype,lineage
+S1,M1A,1,0,DM,MDR,l1
+S2,M1A,1,1,DM,Pre-MDR,l2
+S3,M1A,1,1,OM,Sensitive,l1
+S4,M1A,1,NA,OM,Other,l3
+S5,M1A,1,1,OM,Pre-XDR,l2
+S6,M1A,1,1,DM,XDR,l4
+S7,M1B,1,NA,OM,MDR,l1
+S8,M1B,1,1,DM,Other,l1
+S9,M1B,1,NA,DM,Other,l2
+S10,M1B,1,0,OM,Sensitive,l2
+S11,M1C,1,NA,OM,Pre-XDR,l3
+S12,M1C,1,NA,OM,Pre-XDR,l1
+S13,M1C,1,1,OM,MDR,l1
+S14,M1C,1,NA,DM,MDR,l2
+S15,A2B,2,0,OM,Other,l4
+S16,A2B,2,0,OM,XDR,l4
+S17,A2C,2,NA,DM,Pre-MDR,l5
+S18,A2C,2,1,DM,Pre-MDR,l1
+S19,D3E,3,1,DM,XDR,l2
+S20,D3E,3,NA,DM,MDR,l2
+S21,D3E,3,NA,OM,Pre-MDR,l1
+S22,D3P,3,0,OM,Pre-MDR,l2
+S23,D3A,3,0,OM,Sensitive,l5
+S24,P4A,4,NA,OM,Other,l6
+S25,P5A,5,1,DM,Sensitive,l4
diff --git a/test_data/sample_data.ods b/test_data/sample_data.ods
index aff3cca..73a79a8 100644
Binary files a/test_data/sample_data.ods and b/test_data/sample_data.ods differ