saving work

2022-04-05 14:51:21 +01:00 · 2022-04-05 14:51:21 +01:00 · c647773520
commit c647773520
parent 6a9d23ec8f
3 changed files with 174 additions and 45 deletions
--- a/test_data/processing.py
+++ b/test_data/processing.py
@@ -9,53 +9,182 @@ import sys, os
 import pandas as pd
 import numpy as np
 from statistics import mean, median, mode
-#from statistics import multimode
+from statistics import multimode
 from collections import Counter
-import math
+#import math

 # https://stackoverflow.com/questions/43321455/pandas-count-null-values-in-a-groupby-function
+# https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
+# round up
+#int(math.ceil(mean(foo)))  
 #https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
-#%%
+#%% Read data and formatting
 drug = "pyrazinamide"

 data = pd.read_csv("/home/tanu/git/ML_AI_training/test_data/sample_data.csv")
 data.columns

+# COPY mutation_info_labels column
+data['mutation_info_labels_orig'] = data['mutation_info_labels']
+
 # Convert DM/OM labels to numeric
 dm_om_map = {'DM': 1, 'OM': 0} # pnca, OM is minority, other genes: DM is minority
 data['dm_om_numeric'] = data['mutation_info_labels'].map(dm_om_map)
 # sanity check
-data['dm_om_numeric'].value_counts()
+data['dm_om_numeric'].value_counts() 
 data['mutation_info_labels'].value_counts()

+# Convert drtype column to numeric
+drtype_map = {'XDR': 5
+              , 'Pre-XDR': 4
+              , 'MDR': 3
+              , 'Pre-MDR': 2
+              , 'Other': 1
+              , 'Sensitive': 0}
+
+data['drtype_numeric']  = data['drtype'].map(drtype_map)
+
 # COPY dst column
-data['dst'] = data[drug]
+data['dst'] = data[drug] # to allow cross checking
+data['dst_multimode'] = data[drug]
+
 # sanity check
 data[drug].value_counts()
-data[drug].isnull().sum()
+data['dst_multimode'].value_counts()

-data['dst'].value_counts()
-data['dst'].isnull().sum()
+data[drug].isnull().sum()
+data['dst_multimode'].isnull().sum()

 data['mutationinformation'].value_counts()
 #data.C.isnull().groupby([df['A'],df['B']]).sum().astype(int).reset_index(name='count')
 data[drug].isnull().groupby(data['mutationinformation']).sum()
                             
 # GOAL is to populate na in the dst column from the count of the dm_om_numeric column                          
-data['dst'].isnull().groupby(data['mutationinformation']).sum()                       
+data['dst_multimode'].isnull().groupby(data['mutationinformation']).sum()                       

+# COPY mutationinformation for sanity check
+data['mutation'] = data['mutationinformation']

-# round up
-int(math.ceil(mean(foo)))  
-#https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
-#FIXME
+#%% POC: fill na with mean/mode/median/max for each mutation
 # STAGE 1: replace mean with  Max(multimode), atm it is MEAN
 #na_val = data.groupby(data['mutationinformation'])['dst'].mean()
+data['dst_multimode'].fillna(data.groupby('mutationinformation')['dst_multimode'].transform('mean'))
+data['dst_multimode'].fillna(data.groupby('mutationinformation')['dm_om_numeric'].transform('mean'))

-data['dst'] = data['dst'].fillna(data.groupby('mutationinformation')['dst'].transform('mean'))
+# STAGE 2: Fill TRUE nan with DM.OM column value, atm it is MEAN
+#data['dst_mean_check'] = data['dst'].fillna(data.groupby('mutationinformation')['dst'].transform('mean'))
+#data['dst_mean_check'] = data['dst'].fillna(data.groupby('mutationinformation')['dm_om_numeric'].transform('mean'))
+#%% POC continued: Test getting mode
+#data.groupby('mutationinformation')['dm_om_numeric'].mode()
+data.groupby('mutationinformation')['dm_om_numeric'].agg(mode)
+data.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
+foo = data.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
+foo
+foo = foo.to_frame()
+foo['dm_om_numeric'].apply(lambda x: max(x))# returns nan
+foo['dm_om_numeric'].apply(lambda x: np.nanmax(x))
+#foo.assign(dst_mode = lambda x: (x['dst']))
+foo['multimode_extract'] = foo['dm_om_numeric'].apply(lambda x: max(x))
+foo['multimode_extract'] 
+#%% Recalculating columns [dst, drtype and mutation_info_labels]: SET Index as 'mutationinformation'
+data2 = data.copy()
+# Set 'mutationinformation' as the index so that per-mutation groupby results align directly on assignment
+data2 = data2.set_index(['mutationinformation'])
+#%% Recalculating dst: my data
+#------------------------------
+# Revised dst: max(multimode)
+#------------------------------
+# For each mutation, generate the revised dst which is the mode of dm_om_numeric
+# PROBLEM: Returns the smallest of the two when bimodal, and fails when all equally likely
+# SOLUTION: Using max of the 'dst_noNA' column
+#data2.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)

-# FIXME
-#STAGE 2: Fill TRUE nan with DM.OM column value, atm it is MEAN
-data['dst2'] = data['dst'].fillna(data.groupby('mutationinformation')['dm_om_numeric'].transform('mean'))
-data['dst2'] = data['dst'].fillna(data.groupby('mutationinformation').transform(['dm_om_numeric']))
-     
+# Get multimode for dm_om_numeric column
+dm_om_multimode = data2.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
+#dm_om_multimode
+
+# Fill using multimode ONLY where NA in dst_multimode column
+#data2['dst_multimode'] = data2['dst_multimode'].fillna(dm_om_multimode)
+data2['dst_multimode'] = data2['dst_multimode'].fillna(dm_om_multimode)
+
+# data2['dst_multimode']
+
+# Now get the max from multimode
+data2['dst_noNA'] = data2['dst_multimode'].apply(lambda x: np.nanmax(x))
+print(data2)
+
+# Finally, create a revised dst using the max from the multimode
+data2['dst_mode']  = data2.groupby('mutationinformation')['dst_noNA'].max()
+#==============================================================================
+#%% Recalculating drtype: my data
+#--------------------------------
+# drtype: ALL values:
+# numeric and names in an array 
+#--------------------------------
+data2['drtype_all_vals']  = data2['drtype_numeric']
+data2['drtype_all_names'] = data2['drtype']
+
+# example: https://stackoverflow.com/questions/55125680/pandas-get-all-groupby-values-in-an-array
+# print(df.groupby('key').data.apply(list).reset_index()) # my use case, don't need the reset_index()
+data2['drtype_all_vals']  = data2.groupby('mutationinformation').drtype_all_vals.apply(list)
+data2['drtype_all_names'] = data2.groupby('mutationinformation').drtype_all_names.apply(list)
+
+#---------------------------------
+# Revised drtype: max(Multimode)
+#--------------------------------
+data2['drtype_multimode'] = data2.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode)
+data2['drtype_multimode']
+
+# Now get the max from multimode
+data2['drtype_mode'] = data2['drtype_multimode'].apply(lambda x: np.nanmax(x))
+data2.head()
+
+#----------------------
+# Revised drtype: Max
+#----------------------
+data2.head()
+data2['drtype_max'] =  data2.groupby(['mutationinformation'])['drtype_numeric'].max()
+#data2 = data2.reset_index()
+data2.head()
+
+#%% Finally reset index
+data2 = data2.reset_index()
+#==============================================================================
+#---------------------------------------
+# Create revised mutation_info_labels column
+#---------------------------------------
+data2['dst_mode'].value_counts()
+data2[drug].value_counts()
+
+# note this is overriding, since downstream depends on it
+# make a copy if you need to keep the original values
+data2['mutation_info_labels_orig'] =  data2['mutation_info_labels'] 
+data2['mutation_info_labels']  = data2['dst_mode'].map({1: 'DM'
+                                                    , 0: 'OM'})
+data2['mutation_info_labels_orig'].value_counts()
+data2['mutation_info_labels'].value_counts()
+#==============================================================================
+# sanity check
+if (all(data2['mutation'] == data2['mutationinformation'])):
+    print('\nPass: Mutationinformation check successful')
+else:
+    sys.exit('\nERROR: mutationin cross checks failed. Please check your group_by() aggregate functions')
+
+# Drop mutation column
+data2.drop(['mutation'], axis=1, inplace=True)
+
+#%% Process lineage info
+# add how many different lineages a sample is represented in?
+#%% subset: equivalent of merged_df3?
+# https://stackoverflow.com/questions/39900061/sort-lists-in-a-pandas-dataframe-column
+
+# result = data2['dst_multimode'].sort_values().apply(lambda x: sorted(x))
+# newdf = pd.DataFrame({'dst_multimode': Series(list(set(result['a'].apply(tuple))))})
+# newdf.sort_values(by='a')
+
+# data2['dst_multimode'].value_counts()
+# data2.sort_values(['dst_multimode'], ascending=False)
+
+data_df3 = data2.drop_duplicates(['mutationinformation'])
+data_df3_v2 = data2.drop_duplicates(['mutationinformation'])
+all(data_df3 == data_df3_v2)