saving work

2022-04-05 14:51:21 +01:00 · 2022-04-05 14:51:21 +01:00 · c647773520
commit c647773520
parent 6a9d23ec8f
3 changed files with 174 additions and 45 deletions
--- a/test_data/processing.py
+++ b/test_data/processing.py
@ -9,53 +9,182 @@ import sys, os
 import pandas as pd
 import numpy as np
 from statistics import mean, median, mode
-#from statistics import multimode
+from statistics import multimode
 from collections import Counter
-import math
+#import math
 # https://stackoverflow.com/questions/43321455/pandas-count-null-values-in-a-groupby-function
 # https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
 # round up
 #int(math.ceil(mean(foo)))  
 #https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
-#%%
+#%% Read data and formatting
 drug = "pyrazinamide"
 data = pd.read_csv("/home/tanu/git/ML_AI_training/test_data/sample_data.csv")
 data.columns
 # COPY mutation_info_labels column
 data['mutation_info_labels_orig'] = data['mutation_info_labels']
 # Convert DM/OM labels to numeric
 dm_om_map = {'DM': 1, 'OM': 0} # pnca, OM is minority, other genes: DM is minority
 data['dm_om_numeric'] = data['mutation_info_labels'].map(dm_om_map)
 # sanity check
-data['dm_om_numeric'].value_counts()
+data['dm_om_numeric'].value_counts() 
 data['mutation_info_labels'].value_counts()
 # Convert drtype column to numeric
 drtype_map = {'XDR': 5
              , 'Pre-XDR': 4
              , 'MDR': 3
              , 'Pre-MDR': 2
              , 'Other': 1
              , 'Sensitive': 0}
 data['drtype_numeric']  = data['drtype'].map(drtype_map)
 # COPY dst column
-data['dst'] = data[drug]
+data['dst'] = data[drug] # to allow cross checking
 data['dst_multimode'] = data[drug]
 # sanity check
 data[drug].value_counts()
-data[drug].isnull().sum()
+data['dst_multimode'].value_counts()
-data['dst'].value_counts()
+data[drug].isnull().sum()
-data['dst'].isnull().sum()
+data['dst_multimode'].isnull().sum()
 data['mutationinformation'].value_counts()
 #data.C.isnull().groupby([df['A'],df['B']]).sum().astype(int).reset_index(name='count')
 data[drug].isnull().groupby(data['mutationinformation']).sum()
 # GOAL: populate the NA values in the dst column from the counts of the dm_om_numeric column
-data['dst'].isnull().groupby(data['mutationinformation']).sum()                       
+data['dst_multimode'].isnull().groupby(data['mutationinformation']).sum()                       
 # COPY mutationinformation for sanity check
 data['mutation'] = data['mutationinformation']
-# round up
+#%% POC: fill na with mean/mode/median/max for each mutation
 int(math.ceil(mean(foo)))  
 #https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
 #FIXME
 # STAGE 1: replace the mean with max(multimode); at the moment it is the MEAN
 #na_val = data.groupby(data['mutationinformation'])['dst'].mean()
 data['dst_multimode'].fillna(data.groupby('mutationinformation')['dst_multimode'].transform('mean'))
 data['dst_multimode'].fillna(data.groupby('mutationinformation')['dm_om_numeric'].transform('mean'))
-data['dst'] = data['dst'].fillna(data.groupby('mutationinformation')['dst'].transform('mean'))
+# STAGE 2: fill the true NaN values with the DM/OM column value; at the moment it is the MEAN
 #data['dst_mean_check'] = data['dst'].fillna(data.groupby('mutationinformation')['dst'].transform('mean'))
 #data['dst_mean_check'] = data['dst'].fillna(data.groupby('mutationinformation')['dm_om_numeric'].transform('mean'))
 #%% POC continued: Test getting mode
 #data.groupby('mutationinformation')['dm_om_numeric'].mode()
 data.groupby('mutationinformation')['dm_om_numeric'].agg(mode)
 data.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
 foo = data.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
 foo
 foo = foo.to_frame()
 foo['dm_om_numeric'].apply(lambda x: max(x))# returns nan
 foo['dm_om_numeric'].apply(lambda x: np.nanmax(x))
 #foo.assign(dst_mode = lambda x: (x['dst']))
 foo['multimode_extract'] = foo['dm_om_numeric'].apply(lambda x: max(x))
 foo['multimode_extract'] 
 #%% Recalculating columns [dst, drtype and mutation_info_labels]: SET Index as 'mutationinformation'
 data2 = data.copy()
 # Set 'mutationinformation' as the index so that the groupby results map directly onto the rows
 data2 = data2.set_index(['mutationinformation'])
 #%% Recalculating dst: my data
 #------------------------------
 # Revised dst: max(multimode)
 #------------------------------
 # For each mutation, generate the revised dst which is the mode of dm_om_numeric
 # PROBLEM: Returns the smallest of the two when bimodal, and fails when all equally likely
 # SOLUTION: Using max of the 'dst_noNA' column
 #data2.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
-# FIXME
+# Get multimode for dm_om_numeric column
-#STAGE 2: Fill TRUE nan with DM.OM column value, atm it is MEAN
+dm_om_multimode = data2.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
-data['dst2'] = data['dst'].fillna(data.groupby('mutationinformation')['dm_om_numeric'].transform('mean'))
+#dm_om_multimode
-data['dst2'] = data['dst'].fillna(data.groupby('mutationinformation').transform(['dm_om_numeric']))
+
-     
+# Fill using multimode ONLY where NA in dst_multimode column
 #data2['dst_multimode'] = data2['dst_multimode'].fillna(dm_om_multimode)
 data2['dst_multimode'] = data2['dst_multimode'].fillna(dm_om_multimode)
 # data2['dst_multimode']
 # Now get the max from multimode
 data2['dst_noNA'] = data2['dst_multimode'].apply(lambda x: np.nanmax(x))
 print(data2)
 # Finally, create a revised dst using the max from the multimode
 data2['dst_mode']  = data2.groupby('mutationinformation')['dst_noNA'].max()
 #==============================================================================
 #%% Recalculating drtype: my data
 #--------------------------------
 # drtype: ALL values:
 # numeric and names in an array 
 #--------------------------------
 data2['drtype_all_vals']  = data2['drtype_numeric']
 data2['drtype_all_names'] = data2['drtype']
 # example: https://stackoverflow.com/questions/55125680/pandas-get-all-groupby-values-in-an-array
 # print(df.groupby('key').data.apply(list).reset_index()) # my use case, don't need the reset_index()
 data2['drtype_all_vals']  = data2.groupby('mutationinformation').drtype_all_vals.apply(list)
 data2['drtype_all_names'] = data2.groupby('mutationinformation').drtype_all_names.apply(list)
 #---------------------------------
 # Revised drtype: max(Multimode)
 #--------------------------------
 data2['drtype_multimode'] = data2.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode)
 data2['drtype_multimode']
 # Now get the max from multimode
 data2['drtype_mode'] = data2['drtype_multimode'].apply(lambda x: np.nanmax(x))
 data2.head()
 #----------------------
 # Revised drtype: Max
 #----------------------
 data2.head()
 data2['drtype_max'] =  data2.groupby(['mutationinformation'])['drtype_numeric'].max()
 #data2 = data2.reset_index()
 data2.head()
 #%% Finally reset index
 data2 = data2.reset_index()
 #==============================================================================
 #---------------------------------------
 # Create revised mutation_info_column
 #---------------------------------------
 data2['dst_mode'].value_counts()
 data2[drug].value_counts()
 # NOTE: this overwrites mutation_info_labels, since downstream code depends on it;
 # make a copy first if you need to keep the original values
 data2['mutation_info_labels_orig'] =  data2['mutation_info_labels'] 
 data2['mutation_info_labels']  = data2['dst_mode'].map({1: 'DM'
                                                    , 0: 'OM'})
 data2['mutation_info_labels_orig'].value_counts()
 data2['mutation_info_labels'].value_counts()
 #==============================================================================
 # sanity check
 if (all(data2['mutation'] == data2['mutationinformation'])):
    print('\nPass: Mutationinformation check successful')
 else:
    sys.exit('\nERROR: mutationin cross checks failed. Please check your group_by() aggregate functions')
 # Drop mutation column
 data2.drop(['mutation'], axis=1, inplace=True)
 #%% Process lineage info
 # add how many different lineages a sample is represented in?
 #%% subset: equivalent of merged_df3?
 # https://stackoverflow.com/questions/39900061/sort-lists-in-a-pandas-dataframe-column
 # result = data2['dst_multimode'].sort_values().apply(lambda x: sorted(x))
 # newdf = pd.DataFrame({'dst_multimode': Series(list(set(result['a'].apply(tuple))))})
 # newdf.sort_values(by='a')
 # data2['dst_multimode'].value_counts()
 # data2.sort_values(['dst_multimode'], ascending=False)
 # Keep the first row seen for each mutationinformation value (built twice to cross-check)
 data_df3 = data2.drop_duplicates(['mutationinformation'])
 data_df3_v2 = data2.drop_duplicates(['mutationinformation'])
 # DataFrame.equals compares the two frames cell by cell (NaNs in the same
 # position count as equal). all(df1 == df2) would only iterate over the
 # column labels of the boolean frame and so is always True.
 data_df3.equals(data_df3_v2)
--- a/test_data/sample_data.csv
+++ b/test_data/sample_data.csv
@ -1,26 +1,26 @@
-id,mutationinformation,position,pyrazinamide,mutation_info_labels,drtype
+sample,mutationinformation,position,pyrazinamide,mutation_info_labels,drtype,lineage
-S1,M1A,1,0,DM,MDR
+S1,M1A,1,0,DM,MDR,l1
-S2,M1A,1,1,DM,Pre-MDR
+S2,M1A,1,1,DM,Pre-MDR,l2
-S3,M1A,1,1,OM,Sensitive
+S3,M1A,1,1,OM,Sensitive,l1
-S4,M1A,1,NA,OM,Others
+S4,M1A,1,NA,OM,Other,l3
-S5,M1A,1,1,OM,Pre-XDR
+S5,M1A,1,1,OM,Pre-XDR,l2
-S6,M1A,1,1,DM,XDR
+S6,M1A,1,1,DM,XDR,l4
-S7,M1B,1,NA,OM,MDR
+S7,M1B,1,NA,OM,MDR,l1
-S8,M1B,1,1,DM,MDR
+S8,M1B,1,1,DM,Other,l1
-S9,M1B,1,NA,DM,Other
+S9,M1B,1,NA,DM,Other,l2
-S10,M1B,1,0,OM,Sensitive
+S10,M1B,1,0,OM,Sensitive,l2
-S11,M1C,1,NA,OM,Pre-XDR
+S11,M1C,1,NA,OM,Pre-XDR,l3
-S12,M1C,1,NA,OM,Pre-XDR
+S12,M1C,1,NA,OM,Pre-XDR,l1
-S13,M1C,1,1,OM,MDR
+S13,M1C,1,1,OM,MDR,l1
-S14,M1C,1,NA,DM,MDR
+S14,M1C,1,NA,DM,MDR,l2
-S15,A2B,2,0,OM,Others
+S15,A2B,2,0,OM,Other,l4
-S16,A2B,2,0,OM,XDR
+S16,A2B,2,0,OM,XDR,l4
-S17,A2C,2,NA,DM,Pre-MDR
+S17,A2C,2,NA,DM,Pre-MDR,l5
-S18,A2C,2,1,DM,Pre-MDR
+S18,A2C,2,1,DM,Pre-MDR,l1
-S19,D3E,3,1,DM,XDR
+S19,D3E,3,1,DM,XDR,l2
-S20,D3E,3,NA,DM,MDR
+S20,D3E,3,NA,DM,MDR,l2
-S21,D3E,3,NA,OM,Pre-MDR
+S21,D3E,3,NA,OM,Pre-MDR,l1
-S22,D3P,3,0,OM,Pre-MDR
+S22,D3P,3,0,OM,Pre-MDR,l2
-S23,D3A,3,0,OM,Sensitive
+S23,D3A,3,0,OM,Sensitive,l5
-S24,P4A,4,NA,OM,Others
+S24,P4A,4,NA,OM,Other,l6
-S25,P5A,5,1,DM,Sensitive
+S25,P5A,5,1,DM,Sensitive,l4
--- a/test_data/sample_data.ods
+++ b/test_data/sample_data.ods