saving work

This commit is contained in:
Tanushree Tunstall 2022-04-05 14:51:21 +01:00
parent 6a9d23ec8f
commit c647773520
3 changed files with 174 additions and 45 deletions

View file

@ -9,53 +9,182 @@ import sys, os
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from statistics import mean, median, mode from statistics import mean, median, mode
#from statistics import multimode from statistics import multimode
from collections import Counter from collections import Counter
import math #import math
# https://stackoverflow.com/questions/43321455/pandas-count-null-values-in-a-groupby-function # https://stackoverflow.com/questions/43321455/pandas-count-null-values-in-a-groupby-function
# https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
# round up
#int(math.ceil(mean(foo)))
#https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean #https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
#%% #%% Read data and formatting
drug = "pyrazinamide" drug = "pyrazinamide"
data = pd.read_csv("/home/tanu/git/ML_AI_training/test_data/sample_data.csv") data = pd.read_csv("/home/tanu/git/ML_AI_training/test_data/sample_data.csv")
data.columns data.columns
# COPY mutation_info_labels column
data['mutation_info_labels_orig'] = data['mutation_info_labels']
# Convert DM/OM labels to numeric # Convert DM/OM labels to numeric
dm_om_map = {'DM': 1, 'OM': 0} # pnca, OM is minority, other genes: DM is minority dm_om_map = {'DM': 1, 'OM': 0} # pnca, OM is minority, other genes: DM is minority
data['dm_om_numeric'] = data['mutation_info_labels'].map(dm_om_map) data['dm_om_numeric'] = data['mutation_info_labels'].map(dm_om_map)
# sanity check # sanity check
data['dm_om_numeric'].value_counts() data['dm_om_numeric'].value_counts()
data['mutation_info_labels'].value_counts() data['mutation_info_labels'].value_counts()
# Convert drtype column to numeric
drtype_map = {'XDR': 5
, 'Pre-XDR': 4
, 'MDR': 3
, 'Pre-MDR': 2
, 'Other': 1
, 'Sensitive': 0}
data['drtype_numeric'] = data['drtype'].map(drtype_map)
# COPY dst column # COPY dst column
data['dst'] = data[drug] data['dst'] = data[drug] # to allow cross checking
data['dst_multimode'] = data[drug]
# sanity check # sanity check
data[drug].value_counts() data[drug].value_counts()
data[drug].isnull().sum() data['dst_multimode'].value_counts()
data['dst'].value_counts() data[drug].isnull().sum()
data['dst'].isnull().sum() data['dst_multimode'].isnull().sum()
data['mutationinformation'].value_counts() data['mutationinformation'].value_counts()
#data.C.isnull().groupby([df['A'],df['B']]).sum().astype(int).reset_index(name='count') #data.C.isnull().groupby([df['A'],df['B']]).sum().astype(int).reset_index(name='count')
data[drug].isnull().groupby(data['mutationinformation']).sum() data[drug].isnull().groupby(data['mutationinformation']).sum()
# GOAL is to populate na in the dst column from the count of the dm_om_numeric column # GOAL is to populate na in the dst column from the count of the dm_om_numeric column
data['dst'].isnull().groupby(data['mutationinformation']).sum() data['dst_multimode'].isnull().groupby(data['mutationinformation']).sum()
# COPY mutationinformation for sanity check
data['mutation'] = data['mutationinformation']
# round up #%% POC: fill na with mean/mode/median/max for each mutation
int(math.ceil(mean(foo)))
#https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
#FIXME
# STAGE 1: replace mean with Max(multimode), atm it is MEAN # STAGE 1: replace mean with Max(multimode), atm it is MEAN
#na_val = data.groupby(data['mutationinformation'])['dst'].mean() #na_val = data.groupby(data['mutationinformation'])['dst'].mean()
data['dst_multimode'].fillna(data.groupby('mutationinformation')['dst_multimode'].transform('mean'))
data['dst_multimode'].fillna(data.groupby('mutationinformation')['dm_om_numeric'].transform('mean'))
data['dst'] = data['dst'].fillna(data.groupby('mutationinformation')['dst'].transform('mean')) # STAGE 2: Fill TRUE nan with DM.OM column value, atm it is MEAN
#data['dst_mean_check'] = data['dst'].fillna(data.groupby('mutationinformation')['dst'].transform('mean'))
#data['dst_mean_check'] = data['dst'].fillna(data.groupby('mutationinformation')['dm_om_numeric'].transform('mean'))
#%% POC continued: Test getting mode
# Exploratory cell: compares ways of collapsing dm_om_numeric to a single
# value per mutation. The bare expressions are inspection aids (presumably for
# running the cell interactively) and have no effect on later cells.
#data.groupby('mutationinformation')['dm_om_numeric'].mode()
data.groupby('mutationinformation')['dm_om_numeric'].agg(mode)
data.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
# One list of modal value(s) per mutation
foo = data.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
foo
foo = foo.to_frame()
foo['dm_om_numeric'].apply(lambda x: max(x))# returns nan
foo['dm_om_numeric'].apply(lambda x: np.nanmax(x))
#foo.assign(dst_mode = lambda x: (x['dst']))
# Store the NaN-ignoring maximum of each multimode list: the builtin max()
# variant shown above returns nan (see note), so it is not used for the
# extracted value.
foo['multimode_extract'] = foo['dm_om_numeric'].apply(lambda x: np.nanmax(x))
foo['multimode_extract']
#%% Recalculating columns [dst, drtype and mutation_info_labels]: SET Index as 'mutationinformation'
# Work on a copy indexed by mutation, so that per-mutation groupby results
# (which are also indexed by mutation) map directly onto the rows.
data2 = data.copy().set_index(['mutationinformation'])
#%% Recalculating dst: my data
#------------------------------
# Revised dst: max(multimode)
#------------------------------
# For each mutation, generate the revised dst which is the mode of dm_om_numeric
# PROBLEM: Returns the smallest of the two when bimodal, and fails when all equally likely
# SOLUTION: Using max of the 'dst_noNA' column
#data2.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
# FIXME # Get multimode for dm_om_numeric column
#STAGE 2: Fill TRUE nan with DM.OM column value, atm it is MEAN dm_om_multimode = data2.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
data['dst2'] = data['dst'].fillna(data.groupby('mutationinformation')['dm_om_numeric'].transform('mean')) #dm_om_multimode
data['dst2'] = data['dst'].fillna(data.groupby('mutationinformation').transform(['dm_om_numeric']))
# Fill using multimode ONLY where NA in dst_multimode column.
# NOTE(review): dm_om_multimode is expected to be the per-mutation multimode of
# dm_om_numeric computed just before this step; it is matched to rows through
# the 'mutationinformation' index. Non-missing dst values are left as they are.
data2['dst_multimode'] = data2['dst_multimode'].fillna(dm_om_multimode)
# data2['dst_multimode']
# Now get the max from multimode: np.nanmax reduces each entry to a single
# value while ignoring NaN.
data2['dst_noNA'] = data2['dst_multimode'].apply(lambda x: np.nanmax(x))
print(data2)
# Finally create a revised dst: the largest dst_noNA found within each
# mutation, assigned back to every row of that mutation via the index.
data2['dst_mode'] = data2.groupby('mutationinformation')['dst_noNA'].max()
#==============================================================================
#%% Recalculating drtype: my data
# Each per-mutation result below is indexed by 'mutationinformation', which is
# also the index of data2 at this point, so column assignment lines values up
# by mutation.
#--------------------------------
# drtype: ALL values:
# numeric and names in an array
#--------------------------------
# Gather every drtype seen for a mutation into a list, approach from:
# https://stackoverflow.com/questions/55125680/pandas-get-all-groupby-values-in-an-array
# (reset_index() from that example is not needed here)
data2['drtype_all_vals'] = data2.groupby('mutationinformation')['drtype_numeric'].apply(list)
data2['drtype_all_names'] = data2.groupby('mutationinformation')['drtype'].apply(list)
#---------------------------------
# Revised drtype: max(Multimode)
#--------------------------------
# Most frequent numeric drtype(s) per mutation, as a list
data2['drtype_multimode'] = data2.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode)
data2['drtype_multimode']
# Collapse each multimode list to its largest member
data2['drtype_mode'] = data2['drtype_multimode'].apply(lambda modes: np.nanmax(modes))
data2.head()
#----------------------
# Revised drtype: Max
#----------------------
data2.head()
# Largest numeric drtype observed for each mutation
data2['drtype_max'] = data2.groupby(['mutationinformation'])['drtype_numeric'].agg('max')
#data2 = data2.reset_index()
data2.head()
#%% Finally reset index
# 'mutationinformation' goes back to being an ordinary column
data2 = data2.reset_index()
#==============================================================================
#---------------------------------------
# Create revised mutation_info_column
#---------------------------------------
# Inspect the revised dst next to the raw drug column before relabelling
data2['dst_mode'].value_counts()
data2[drug].value_counts()
# NOTE: 'mutation_info_labels' is overwritten here because downstream code
# depends on that column; the previous labels are kept in
# 'mutation_info_labels_orig' in case they are needed.
dst_mode_to_label = {1: 'DM', 0: 'OM'}
data2['mutation_info_labels_orig'] = data2['mutation_info_labels']
data2['mutation_info_labels'] = data2['dst_mode'].map(dst_mode_to_label)
# Label counts before and after the revision
data2['mutation_info_labels_orig'].value_counts()
data2['mutation_info_labels'].value_counts()
#==============================================================================
# sanity check: the 'mutation' copy must still equal 'mutationinformation' on
# every row; stop the script if it does not
mutations_match = (data2['mutation'] == data2['mutationinformation']).all()
if not mutations_match:
    sys.exit('\nERROR: mutationin cross checks failed. Please check your group_by() aggregate functions')
print('\nPass: Mutationinformation check successful')
# The helper column is no longer needed once the check has passed
data2.drop(columns=['mutation'], inplace=True)
#%% Process lineage info
# add how many different lineages a sample is represented in?
#%% subset: equivalent of merged_df3?
# https://stackoverflow.com/questions/39900061/sort-lists-in-a-pandas-dataframe-column
# result = data2['dst_multimode'].sort_values().apply(lambda x: sorted(x))
# newdf = pd.DataFrame({'dst_multimode': Series(list(set(result['a'].apply(tuple))))})
# newdf.sort_values(by='a')
# data2['dst_multimode'].value_counts()
# data2.sort_values(['dst_multimode'], ascending=False)
# One row per mutation (first occurrence kept), built twice for cross-checking
data_df3 = data2.drop_duplicates(['mutationinformation'])
data_df3_v2 = data2.drop_duplicates(['mutationinformation'])
# Compare the two frames by content. The builtin all() over a DataFrame walks
# its column labels rather than its cell values, so it cannot detect a
# difference; DataFrame.equals compares the values and counts NaNs in the same
# position as equal.
data_df3.equals(data_df3_v2)

View file

@ -1,26 +1,26 @@
id,mutationinformation,position,pyrazinamide,mutation_info_labels,drtype sample,mutationinformation,position,pyrazinamide,mutation_info_labels,drtype,lineage
S1,M1A,1,0,DM,MDR S1,M1A,1,0,DM,MDR,l1
S2,M1A,1,1,DM,Pre-MDR S2,M1A,1,1,DM,Pre-MDR,l2
S3,M1A,1,1,OM,Sensitive S3,M1A,1,1,OM,Sensitive,l1
S4,M1A,1,NA,OM,Others S4,M1A,1,NA,OM,Other,l3
S5,M1A,1,1,OM,Pre-XDR S5,M1A,1,1,OM,Pre-XDR,l2
S6,M1A,1,1,DM,XDR S6,M1A,1,1,DM,XDR,l4
S7,M1B,1,NA,OM,MDR S7,M1B,1,NA,OM,MDR,l1
S8,M1B,1,1,DM,MDR S8,M1B,1,1,DM,Other,l1
S9,M1B,1,NA,DM,Other S9,M1B,1,NA,DM,Other,l2
S10,M1B,1,0,OM,Sensitive S10,M1B,1,0,OM,Sensitive,l2
S11,M1C,1,NA,OM,Pre-XDR S11,M1C,1,NA,OM,Pre-XDR,l3
S12,M1C,1,NA,OM,Pre-XDR S12,M1C,1,NA,OM,Pre-XDR,l1
S13,M1C,1,1,OM,MDR S13,M1C,1,1,OM,MDR,l1
S14,M1C,1,NA,DM,MDR S14,M1C,1,NA,DM,MDR,l2
S15,A2B,2,0,OM,Others S15,A2B,2,0,OM,Other,l4
S16,A2B,2,0,OM,XDR S16,A2B,2,0,OM,XDR,l4
S17,A2C,2,NA,DM,Pre-MDR S17,A2C,2,NA,DM,Pre-MDR,l5
S18,A2C,2,1,DM,Pre-MDR S18,A2C,2,1,DM,Pre-MDR,l1
S19,D3E,3,1,DM,XDR S19,D3E,3,1,DM,XDR,l2
S20,D3E,3,NA,DM,MDR S20,D3E,3,NA,DM,MDR,l2
S21,D3E,3,NA,OM,Pre-MDR S21,D3E,3,NA,OM,Pre-MDR,l1
S22,D3P,3,0,OM,Pre-MDR S22,D3P,3,0,OM,Pre-MDR,l2
S23,D3A,3,0,OM,Sensitive S23,D3A,3,0,OM,Sensitive,l5
S24,P4A,4,NA,OM,Others S24,P4A,4,NA,OM,Other,l6
S25,P5A,5,1,DM,Sensitive S25,P5A,5,1,DM,Sensitive,l4

1 id sample mutationinformation position pyrazinamide mutation_info_labels drtype lineage
2 S1 M1A 1 0 DM MDR l1
3 S2 M1A 1 1 DM Pre-MDR l2
4 S3 M1A 1 1 OM Sensitive l1
5 S4 M1A 1 NA OM Others Other l3
6 S5 M1A 1 1 OM Pre-XDR l2
7 S6 M1A 1 1 DM XDR l4
8 S7 M1B 1 NA OM MDR l1
9 S8 M1B 1 1 DM MDR Other l1
10 S9 M1B 1 NA DM Other l2
11 S10 M1B 1 0 OM Sensitive l2
12 S11 M1C 1 NA OM Pre-XDR l3
13 S12 M1C 1 NA OM Pre-XDR l1
14 S13 M1C 1 1 OM MDR l1
15 S14 M1C 1 NA DM MDR l2
16 S15 A2B 2 0 OM Others Other l4
17 S16 A2B 2 0 OM XDR l4
18 S17 A2C 2 NA DM Pre-MDR l5
19 S18 A2C 2 1 DM Pre-MDR l1
20 S19 D3E 3 1 DM XDR l2
21 S20 D3E 3 NA DM MDR l2
22 S21 D3E 3 NA OM Pre-MDR l1
23 S22 D3P 3 0 OM Pre-MDR l2
24 S23 D3A 3 0 OM Sensitive l5
25 S24 P4A 4 NA OM Others Other l6
26 S25 P5A 5 1 DM Sensitive l4

Binary file not shown.