saving work
This commit is contained in:
parent
6a9d23ec8f
commit
c647773520
3 changed files with 174 additions and 45 deletions
|
@ -9,53 +9,182 @@ import sys, os
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from statistics import mean, median, mode
|
from statistics import mean, median, mode
|
||||||
#from statistics import multimode
|
from statistics import multimode
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
import math
|
#import math
|
||||||
|
|
||||||
# https://stackoverflow.com/questions/43321455/pandas-count-null-values-in-a-groupby-function
|
# https://stackoverflow.com/questions/43321455/pandas-count-null-values-in-a-groupby-function
|
||||||
|
# https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
|
||||||
|
# round up
|
||||||
|
#int(math.ceil(mean(foo)))
|
||||||
#https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
|
#https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
|
||||||
#%%
|
#%% Read data and formatting
|
||||||
drug = "pyrazinamide"
|
drug = "pyrazinamide"
|
||||||
|
|
||||||
data = pd.read_csv("/home/tanu/git/ML_AI_training/test_data/sample_data.csv")
|
data = pd.read_csv("/home/tanu/git/ML_AI_training/test_data/sample_data.csv")
|
||||||
data.columns
|
data.columns
|
||||||
|
|
||||||
|
# COPY mutation_info_labels column
|
||||||
|
data['mutation_info_labels_orig'] = data['mutation_info_labels']
|
||||||
|
|
||||||
# Convert DM/OM labels to numeric
|
# Convert DM/OM labels to numeric
|
||||||
dm_om_map = {'DM': 1, 'OM': 0} # pnca, OM is minority, other genes: DM is minority
|
dm_om_map = {'DM': 1, 'OM': 0} # pnca, OM is minority, other genes: DM is minority
|
||||||
data['dm_om_numeric'] = data['mutation_info_labels'].map(dm_om_map)
|
data['dm_om_numeric'] = data['mutation_info_labels'].map(dm_om_map)
|
||||||
# sanity check
|
# sanity check
|
||||||
data['dm_om_numeric'].value_counts()
|
data['dm_om_numeric'].value_counts()
|
||||||
data['mutation_info_labels'].value_counts()
|
data['mutation_info_labels'].value_counts()
|
||||||
|
|
||||||
|
# Convert drtype column to numeric
|
||||||
|
drtype_map = {'XDR': 5
|
||||||
|
, 'Pre-XDR': 4
|
||||||
|
, 'MDR': 3
|
||||||
|
, 'Pre-MDR': 2
|
||||||
|
, 'Other': 1
|
||||||
|
, 'Sensitive': 0}
|
||||||
|
|
||||||
|
data['drtype_numeric'] = data['drtype'].map(drtype_map)
|
||||||
|
|
||||||
# COPY dst column
|
# COPY dst column
|
||||||
data['dst'] = data[drug]
|
data['dst'] = data[drug] # to allow cross checking
|
||||||
|
data['dst_multimode'] = data[drug]
|
||||||
|
|
||||||
# sanity check
|
# sanity check
|
||||||
data[drug].value_counts()
|
data[drug].value_counts()
|
||||||
data[drug].isnull().sum()
|
data['dst_multimode'].value_counts()
|
||||||
|
|
||||||
data['dst'].value_counts()
|
data[drug].isnull().sum()
|
||||||
data['dst'].isnull().sum()
|
data['dst_multimode'].isnull().sum()
|
||||||
|
|
||||||
data['mutationinformation'].value_counts()
|
data['mutationinformation'].value_counts()
|
||||||
#data.C.isnull().groupby([df['A'],df['B']]).sum().astype(int).reset_index(name='count')
|
#data.C.isnull().groupby([df['A'],df['B']]).sum().astype(int).reset_index(name='count')
|
||||||
data[drug].isnull().groupby(data['mutationinformation']).sum()
|
data[drug].isnull().groupby(data['mutationinformation']).sum()
|
||||||
|
|
||||||
# GOAL is to populate na in the dst column from the count of the dm_om_numeric column
|
# GOAL is to populate na in the dst column from the count of the dm_om_numeric column
|
||||||
data['dst'].isnull().groupby(data['mutationinformation']).sum()
|
data['dst_multimode'].isnull().groupby(data['mutationinformation']).sum()
|
||||||
|
|
||||||
|
# COPY mutationinformation for sanity check
|
||||||
|
data['mutation'] = data['mutationinformation']
|
||||||
|
|
||||||
# round up
|
#%% POC: fill na with mean/mode/median/max for each mutation
|
||||||
int(math.ceil(mean(foo)))
|
|
||||||
#https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
|
|
||||||
#FIXME
|
|
||||||
# STAGE 1: replace mean with Max(multimode), atm it is MEAN
|
# STAGE 1: replace mean with Max(multimode), atm it is MEAN
|
||||||
#na_val = data.groupby(data['mutationinformation'])['dst'].mean()
|
#na_val = data.groupby(data['mutationinformation'])['dst'].mean()
|
||||||
|
data['dst_multimode'].fillna(data.groupby('mutationinformation')['dst_multimode'].transform('mean'))
|
||||||
|
data['dst_multimode'].fillna(data.groupby('mutationinformation')['dm_om_numeric'].transform('mean'))
|
||||||
|
|
||||||
data['dst'] = data['dst'].fillna(data.groupby('mutationinformation')['dst'].transform('mean'))
|
# STAGE 2: Fill TRUE nan with DM.OM column value, atm it is MEAN
|
||||||
|
#data['dst_mean_check'] = data['dst'].fillna(data.groupby('mutationinformation')['dst'].transform('mean'))
|
||||||
|
#data['dst_mean_check'] = data['dst'].fillna(data.groupby('mutationinformation')['dm_om_numeric'].transform('mean'))
|
||||||
|
#%% POC continued: Test getting mode
|
||||||
|
#data.groupby('mutationinformation')['dm_om_numeric'].mode()
|
||||||
|
data.groupby('mutationinformation')['dm_om_numeric'].agg(mode)
|
||||||
|
data.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
|
||||||
|
foo = data.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
|
||||||
|
foo
|
||||||
|
foo = foo.to_frame()
|
||||||
|
foo['dm_om_numeric'].apply(lambda x: max(x))# returns nan
|
||||||
|
foo['dm_om_numeric'].apply(lambda x: np.nanmax(x))
|
||||||
|
#foo.assign(dst_mode = lambda x: (x['dst']))
|
||||||
|
foo['multimode_extract'] = foo['dm_om_numeric'].apply(lambda x: max(x))
|
||||||
|
foo['multimode_extract']
|
||||||
|
#%% Recalculating columns [dst, drtype and mutation_info_labels]: SET Index as 'mutationinformation'
|
||||||
|
data2 = data.copy()
|
||||||
|
# Set 'mutationinformation' as the index so that per-mutation groupby results align directly when assigned back as columns
|
||||||
|
data2 = data2.set_index(['mutationinformation'])
|
||||||
|
#%% Recalculating dst: my data
|
||||||
|
#------------------------------
|
||||||
|
# Revised dst: max(multimode)
|
||||||
|
#------------------------------
|
||||||
|
# For each mutation, generate the revised dst which is the mode of dm_om_numeric
|
||||||
|
# PROBLEM: Returns the smallest of the two when bimodal, and fails when all equally likely
|
||||||
|
# SOLUTION: Using max of the 'dst_noNA' column
|
||||||
|
#data2.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
|
||||||
|
|
||||||
# FIXME
|
# Get multimode for dm_om_numeric column
|
||||||
#STAGE 2: Fill TRUE nan with DM.OM column value, atm it is MEAN
|
dm_om_multimode = data2.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
|
||||||
data['dst2'] = data['dst'].fillna(data.groupby('mutationinformation')['dm_om_numeric'].transform('mean'))
|
#dm_om_multimode
|
||||||
data['dst2'] = data['dst'].fillna(data.groupby('mutationinformation').transform(['dm_om_numeric']))
|
|
||||||
|
# Fill using multimode ONLY where NA in dst_multimode column
|
||||||
|
#data2['dst_multimode'] = data2['dst_multimode'].fillna(dm_om_multimode)
|
||||||
|
data2['dst_multimode'] = data2['dst_multimode'].fillna(dm_om_multimode)
|
||||||
|
|
||||||
|
# data2['dst_multimode']
|
||||||
|
|
||||||
|
# Now get the max from multimode
|
||||||
|
data2['dst_noNA'] = data2['dst_multimode'].apply(lambda x: np.nanmax(x))
|
||||||
|
print(data2)
|
||||||
|
|
||||||
|
# Finally, create a revised dst using the max from the multimode
|
||||||
|
data2['dst_mode'] = data2.groupby('mutationinformation')['dst_noNA'].max()
|
||||||
|
#==============================================================================
|
||||||
|
#%% Recalculating drtype: my data
|
||||||
|
#--------------------------------
|
||||||
|
# drtype: ALL values:
|
||||||
|
# numeric and names in an array
|
||||||
|
#--------------------------------
|
||||||
|
data2['drtype_all_vals'] = data2['drtype_numeric']
|
||||||
|
data2['drtype_all_names'] = data2['drtype']
|
||||||
|
|
||||||
|
# example: https://stackoverflow.com/questions/55125680/pandas-get-all-groupby-values-in-an-array
|
||||||
|
# print(df.groupby('key').data.apply(list).reset_index()) # my use case, don't need the reset_index()
|
||||||
|
data2['drtype_all_vals'] = data2.groupby('mutationinformation').drtype_all_vals.apply(list)
|
||||||
|
data2['drtype_all_names'] = data2.groupby('mutationinformation').drtype_all_names.apply(list)
|
||||||
|
|
||||||
|
#---------------------------------
|
||||||
|
# Revised drtype: max(Multimode)
|
||||||
|
#--------------------------------
|
||||||
|
data2['drtype_multimode'] = data2.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode)
|
||||||
|
data2['drtype_multimode']
|
||||||
|
|
||||||
|
# Now get the max from multimode
|
||||||
|
data2['drtype_mode'] = data2['drtype_multimode'].apply(lambda x: np.nanmax(x))
|
||||||
|
data2.head()
|
||||||
|
|
||||||
|
#----------------------
|
||||||
|
# Revised drtype: Max
|
||||||
|
#----------------------
|
||||||
|
data2.head()
|
||||||
|
data2['drtype_max'] = data2.groupby(['mutationinformation'])['drtype_numeric'].max()
|
||||||
|
#data2 = data2.reset_index()
|
||||||
|
data2.head()
|
||||||
|
|
||||||
|
#%% Finally reset index
|
||||||
|
data2 = data2.reset_index()
|
||||||
|
#==============================================================================
|
||||||
|
#---------------------------------------
|
||||||
|
# Create revised mutation_info_column
|
||||||
|
#---------------------------------------
|
||||||
|
data2['dst_mode'].value_counts()
|
||||||
|
data2[drug].value_counts()
|
||||||
|
|
||||||
|
# note this is overriding, since downstream depends on it
|
||||||
|
# make a copy first if you need to keep the original values
|
||||||
|
data2['mutation_info_labels_orig'] = data2['mutation_info_labels']
|
||||||
|
data2['mutation_info_labels'] = data2['dst_mode'].map({1: 'DM'
|
||||||
|
, 0: 'OM'})
|
||||||
|
data2['mutation_info_labels_orig'].value_counts()
|
||||||
|
data2['mutation_info_labels'].value_counts()
|
||||||
|
#==============================================================================
|
||||||
|
# sanity check
|
||||||
|
if (all(data2['mutation'] == data2['mutationinformation'])):
|
||||||
|
print('\nPass: Mutationinformation check successful')
|
||||||
|
else:
|
||||||
|
sys.exit('\nERROR: mutationin cross checks failed. Please check your group_by() aggregate functions')
|
||||||
|
|
||||||
|
# Drop mutation column
|
||||||
|
data2.drop(['mutation'], axis=1, inplace=True)
|
||||||
|
|
||||||
|
#%% Process lineage info
|
||||||
|
# add how many different lineages a sample is represented in?
|
||||||
|
#%% subset: equivalent of merged_df3?
|
||||||
|
# https://stackoverflow.com/questions/39900061/sort-lists-in-a-pandas-dataframe-column
|
||||||
|
|
||||||
|
# result = data2['dst_multimode'].sort_values().apply(lambda x: sorted(x))
|
||||||
|
# newdf = pd.DataFrame({'dst_multimode': Series(list(set(result['a'].apply(tuple))))})
|
||||||
|
# newdf.sort_values(by='a')
|
||||||
|
|
||||||
|
# data2['dst_multimode'].value_counts()
|
||||||
|
# data2.sort_values(['dst_multimode'], ascending=False)
|
||||||
|
|
||||||
|
data_df3 = data2.drop_duplicates(['mutationinformation'])
|
||||||
|
data_df3_v2 = data2.drop_duplicates(['mutationinformation'])
|
||||||
|
all(data_df3 == data_df3_v2)
|
|
@ -1,26 +1,26 @@
|
||||||
id,mutationinformation,position,pyrazinamide,mutation_info_labels,drtype
|
sample,mutationinformation,position,pyrazinamide,mutation_info_labels,drtype,lineage
|
||||||
S1,M1A,1,0,DM,MDR
|
S1,M1A,1,0,DM,MDR,l1
|
||||||
S2,M1A,1,1,DM,Pre-MDR
|
S2,M1A,1,1,DM,Pre-MDR,l2
|
||||||
S3,M1A,1,1,OM,Sensitive
|
S3,M1A,1,1,OM,Sensitive,l1
|
||||||
S4,M1A,1,NA,OM,Others
|
S4,M1A,1,NA,OM,Other,l3
|
||||||
S5,M1A,1,1,OM,Pre-XDR
|
S5,M1A,1,1,OM,Pre-XDR,l2
|
||||||
S6,M1A,1,1,DM,XDR
|
S6,M1A,1,1,DM,XDR,l4
|
||||||
S7,M1B,1,NA,OM,MDR
|
S7,M1B,1,NA,OM,MDR,l1
|
||||||
S8,M1B,1,1,DM,MDR
|
S8,M1B,1,1,DM,Other,l1
|
||||||
S9,M1B,1,NA,DM,Other
|
S9,M1B,1,NA,DM,Other,l2
|
||||||
S10,M1B,1,0,OM,Sensitive
|
S10,M1B,1,0,OM,Sensitive,l2
|
||||||
S11,M1C,1,NA,OM,Pre-XDR
|
S11,M1C,1,NA,OM,Pre-XDR,l3
|
||||||
S12,M1C,1,NA,OM,Pre-XDR
|
S12,M1C,1,NA,OM,Pre-XDR,l1
|
||||||
S13,M1C,1,1,OM,MDR
|
S13,M1C,1,1,OM,MDR,l1
|
||||||
S14,M1C,1,NA,DM,MDR
|
S14,M1C,1,NA,DM,MDR,l2
|
||||||
S15,A2B,2,0,OM,Others
|
S15,A2B,2,0,OM,Other,l4
|
||||||
S16,A2B,2,0,OM,XDR
|
S16,A2B,2,0,OM,XDR,l4
|
||||||
S17,A2C,2,NA,DM,Pre-MDR
|
S17,A2C,2,NA,DM,Pre-MDR,l5
|
||||||
S18,A2C,2,1,DM,Pre-MDR
|
S18,A2C,2,1,DM,Pre-MDR,l1
|
||||||
S19,D3E,3,1,DM,XDR
|
S19,D3E,3,1,DM,XDR,l2
|
||||||
S20,D3E,3,NA,DM,MDR
|
S20,D3E,3,NA,DM,MDR,l2
|
||||||
S21,D3E,3,NA,OM,Pre-MDR
|
S21,D3E,3,NA,OM,Pre-MDR,l1
|
||||||
S22,D3P,3,0,OM,Pre-MDR
|
S22,D3P,3,0,OM,Pre-MDR,l2
|
||||||
S23,D3A,3,0,OM,Sensitive
|
S23,D3A,3,0,OM,Sensitive,l5
|
||||||
S24,P4A,4,NA,OM,Others
|
S24,P4A,4,NA,OM,Other,l6
|
||||||
S25,P5A,5,1,DM,Sensitive
|
S25,P5A,5,1,DM,Sensitive,l4
|
||||||
|
|
|
Binary file not shown.
Loading…
Add table
Add a link
Reference in a new issue