mostly done, now adding lineage magicry
This commit is contained in:
parent
f05cb96346
commit
ae3a5500c9
1 changed files with 302 additions and 42 deletions
|
@ -1440,79 +1440,339 @@ else:
|
|||
|
||||
# clear variables
|
||||
del(k, v, wt, mut, lookup_dict)
|
||||
#%% NEW mappings: gene_LF2
|
||||
# gene_LF2: copy gene_LF1
|
||||
gene_LF2 = gene_LF1.copy()
|
||||
|
||||
#%% NEW TODO: map mutationinformation
|
||||
#%% AF for gene
|
||||
gene_LF2['id'].nunique()
|
||||
gene_LF2['mutationinformation'].nunique()
|
||||
total_id_ucount = gene_LF2['id'].nunique()
|
||||
total_id_ucount
|
||||
total_id_ucount2 = gene_LF2['sample'].nunique()
|
||||
total_id_ucount2
|
||||
|
||||
#%% NEW: mappings
|
||||
if total_id_ucount == total_id_ucount2:
|
||||
print('\nPASS: sample and id unique counts match')
|
||||
else:
|
||||
print('\nFAIL: sample and id unique counts DO NOT match!'
|
||||
, '\nGWAS worry!?')
|
||||
|
||||
# Add all sample ids in a list for sanity checks
|
||||
#gene_LF2['id_list'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].apply(list))
|
||||
|
||||
#=======================================
|
||||
# Add column: total unique id/sample count
|
||||
#=======================================
|
||||
gene_LF2['total_id_ucount'] = total_id_ucount
|
||||
|
||||
#==========================================
|
||||
# Add column: mutation count in all samples
|
||||
#==========================================
|
||||
gene_LF2['mut_id_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique())
|
||||
gene_LF2['mut_id_ucount']
|
||||
|
||||
#=================
|
||||
# Add column: AF
|
||||
#=================
|
||||
gene_LF2['maf'] = gene_LF2['mut_id_ucount']/gene_LF2['total_id_ucount']
|
||||
gene_LF2['maf'].head()
|
||||
|
||||
#%% Mapping 1: column '<drug>', mutation_info
|
||||
#=======================
|
||||
# column name: <drug>
|
||||
#=======================
|
||||
# mapping 1: labels
|
||||
# mapping 1.1: labels
|
||||
dm_om_label_map = {dr_muts_col: 'DM'
|
||||
, other_muts_col: 'OM'}
|
||||
dm_om_label_map
|
||||
gene_LF2['mutation_info_labels'] = gene_LF2['mutation_info'].map(dm_om_label_map)
|
||||
|
||||
gene_LF0['mutation_info_labels'] = gene_LF0['mutation_info'].map(dm_om_label_map)
|
||||
|
||||
# mapping 2: numeric
|
||||
# mapping 1.2: numeric
|
||||
dm_om_num_map = {dr_muts_col: 1
|
||||
, other_muts_col: 0}
|
||||
|
||||
gene_LF0['dm_om_numeric'] = gene_LF0['mutation_info'].map(dm_om_num_map)
|
||||
gene_LF2['dm_om_numeric'] = gene_LF2['mutation_info'].map(dm_om_num_map)
|
||||
gene_LF2['dm_om_numeric_orig'] = gene_LF2['mutation_info_orig'].map(dm_om_num_map)
|
||||
|
||||
gene_LF0['mutation_info'].value_counts()
|
||||
gene_LF0['mutation_info_labels'].value_counts()
|
||||
gene_LF0['dm_om_numeric'].value_counts()
|
||||
gene_LF2['mutation_info'].value_counts()
|
||||
gene_LF2['mutation_info_labels'].value_counts()
|
||||
gene_LF2['dm_om_numeric'].value_counts()
|
||||
gene_LF2['dm_om_numeric_orig'].value_counts()
|
||||
|
||||
#%% Mapping 2: column '<drtype>', mutation
|
||||
#============================
|
||||
# column name: <drtype>
|
||||
#============================
|
||||
gene_LF0['drtype'].value_counts()
|
||||
gene_LF2['drtype'].value_counts()
|
||||
|
||||
# mapping: numeric
|
||||
# mapping 2.1: numeric
|
||||
drtype_map = {'XDR': 5
|
||||
, 'Pre-XDR': 4
|
||||
, 'MDR': 3
|
||||
, 'Pre-MDR': 2
|
||||
, 'Other': 1
|
||||
, 'Sensitive': 0}
|
||||
gene_LF0['drtype_numeric'] = gene_LF0['drtype'].map(drtype_map)
|
||||
|
||||
gene_LF0['drtype'].value_counts()
|
||||
gene_LF0['drtype_numeric'].value_counts()
|
||||
gene_LF2['drtype_numeric'] = gene_LF2['drtype'].map(drtype_map)
|
||||
|
||||
#%% multimode
|
||||
# COPY dst column
|
||||
gene_LF0['dst'] = gene_LF0[drug] # to allow cross checking
|
||||
gene_LF0['dst_multimode'] = gene_LF0[drug]
|
||||
gene_LF2['drtype'].value_counts()
|
||||
gene_LF2['drtype_numeric'].value_counts()
|
||||
|
||||
#%% Recalculations: Multimode
|
||||
# copy dst column
|
||||
gene_LF2['dst'] = gene_LF2[drug] # to allow cross checking
|
||||
gene_LF2['dst'].equals(gene_LF2[drug])
|
||||
|
||||
gene_LF2['dst_multimode'] = gene_LF2[drug]
|
||||
|
||||
# sanity check
|
||||
gene_LF0[drug].value_counts()
|
||||
gene_LF0['dst_multimode'].value_counts()
|
||||
gene_LF0[drug].isna().sum()
|
||||
gene_LF2[drug].value_counts()
|
||||
gene_LF2['dst_multimode'].value_counts()
|
||||
|
||||
if gene_LF0[drug].value_counts().sum()+gene_LF0[drug].isna().sum() == len(gene_LF0):
|
||||
print('\nPASS:', 'Numbers match')
|
||||
else:
|
||||
print('\nFAIL:', 'Numbers mismatch')
|
||||
gene_LF2[drug].isnull().sum()
|
||||
gene_LF2['dst_multimode'].isnull().sum()
|
||||
|
||||
gene_LF2['mutationinformation'].value_counts()
|
||||
gene_LF2[drug].isnull().groupby(gene_LF2['mutationinformation']).sum()
|
||||
|
||||
gene_LF0['mutation'].value_counts()
|
||||
#data.C.isnull().groupby([df['A'],df['B']]).sum().astype(int).reset_index(name='count')
|
||||
gene_LF0[drug].isnull().groupby(gene_LF0['mutation']).sum()
|
||||
|
||||
# GOAL is to populate na in the dst column from the count of the dm_om_numeric column
|
||||
gene_LF0['dst_multimode'].isnull().groupby(gene_LF0['mutationinformation']).sum()
|
||||
|
||||
# COPY mutationinformation for sanity check
|
||||
#data['mutation'] = data['mutationinformation']
|
||||
gene_LF0['mutationinformation'] = gene_LF0['mutation']
|
||||
|
||||
gene_LF2['dst_multimode'].isnull().groupby(gene_LF2['mutationinformation']).sum()
|
||||
gene_LF2.index
|
||||
gene_LF2['index_orig'] = gene_LF2.index # need it for setting back later
|
||||
#%% FIXME: Add sanity check
|
||||
#gene_LF2['index_orig'].equals(gene_LF2.index)
|
||||
#%% Set index: 'mutationinformation' for adding multimode
|
||||
gene_LF3 = gene_LF2.set_index(['mutationinformation'])
|
||||
gene_LF3.index
|
||||
gene_LF3['dst_multimode'].value_counts()
|
||||
gene_LF3['dst_multimode'].value_counts().sum()
|
||||
#%% Multimode: dst
|
||||
# For each mutation, generate the revised dst which is the mode of dm_om_numeric
|
||||
#=============================
|
||||
# Recalculation: Revised dst
|
||||
# max(multimode)
|
||||
#=============================
|
||||
# Get multimode for dm_om_numeric column
|
||||
dm_om_multimode = gene_LF0.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
|
||||
dm_om_multimode
|
||||
dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
|
||||
dm_om_multimode_LF3
|
||||
|
||||
gene_LF0['dst_multimode'] = gene_LF0['dst_multimode'].fillna(dm_om_multimode)
|
||||
gene_LF0['dst_noNA'] = gene_LF0['dst_multimode'].apply(lambda x: np.nanmax(x))
|
||||
print(gene_LF0)
|
||||
# Fill using multimode ONLY where NA in dst_multimode column
|
||||
gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3)
|
||||
|
||||
# Now get the max from multimode
|
||||
gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
|
||||
print(gene_LF3)
|
||||
|
||||
#----------------------------
|
||||
# Revised dst column: Max
|
||||
#----------------------------
|
||||
# Finally created a revised dst with the max from the multimode
|
||||
gene_LF0['dst_mode'] = gene_LF0.groupby('mutationinformation')['dst_noNA'].max()
|
||||
gene_LF3['dst_mode'] = gene_LF3.groupby('mutationinformation')['dst_noNA'].max()
|
||||
|
||||
#%% Multimode: drtype
|
||||
#=============================
|
||||
# Recalculation: Revised drtype
|
||||
# max(multimode)
|
||||
#=============================
|
||||
#--------------------------------
|
||||
# drtype: ALL values:
|
||||
# numeric and names in an array
|
||||
#--------------------------------
|
||||
gene_LF3['drtype_all_vals'] = gene_LF3['drtype_numeric']
|
||||
gene_LF3['drtype_all_names'] = gene_LF3['drtype']
|
||||
|
||||
gene_LF3['drtype_all_vals'] = gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list)
|
||||
gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list)
|
||||
|
||||
#---------------------------------
|
||||
# Revised drtype: max(Multimode)
|
||||
#--------------------------------
|
||||
gene_LF3['drtype_multimode'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode)
|
||||
gene_LF3['drtype_multimode']
|
||||
|
||||
# Now get the max from multimode
|
||||
gene_LF3['drtype_mode'] = gene_LF3['drtype_multimode'].apply(lambda x: np.nanmax(x))
|
||||
gene_LF3.head()
|
||||
|
||||
#----------------------
|
||||
# Revised drtype: Max
|
||||
#----------------------
|
||||
gene_LF3.head()
|
||||
gene_LF3['drtype_max'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].max()
|
||||
gene_LF3.head()
|
||||
|
||||
#%% Reset index: original indices
|
||||
#gene_LF3 = gene_LF3.reset_index()
|
||||
gene_LF3.index
|
||||
gene_LF3['mutationinformation'] = gene_LF3.index
|
||||
gene_LF3 = gene_LF3.set_index(['index_orig'])
|
||||
|
||||
gene_LF3[['mutationinformation']]
|
||||
gene_LF3.index
|
||||
#%% Revised counts
|
||||
gene_LF3['dst_mode'].value_counts()
|
||||
gene_LF3[drug].value_counts()
|
||||
|
||||
print('\n------------------------------------------------------'
|
||||
, '\nRevised counting: mutation_info i.e dm om column'
|
||||
, '\n-----------------------------------------------------'
|
||||
|
||||
, '\n----------------------------------'
|
||||
, '\nOriginal drug column count'
|
||||
, '\n----------------------------------'
|
||||
, gene_LF3[drug].value_counts()
|
||||
, '\nTotal samples [original]:', gene_LF3[drug].value_counts().sum()
|
||||
|
||||
, '\n----------------------------------'
|
||||
, '\nRevised drug column count'
|
||||
, '\n----------------------------------'
|
||||
, gene_LF3['dst_mode'].value_counts()
|
||||
, '\nTotal samples [revised]:', gene_LF3['dst_mode'].value_counts().sum()
|
||||
)
|
||||
#%% FIXME: CHECK THIS, run this with mutation_info_REV
|
||||
#---------------------------------------
|
||||
# Create revised mutation_info_column
|
||||
#---------------------------------------
|
||||
# Note this is overriding, since downstream depends on it
|
||||
# make a copy you if you need to keep that
|
||||
# create a copy before running this
|
||||
|
||||
gene_LF3['mutation_info_labels_orig'] = gene_LF3['mutation_info_labels']
|
||||
|
||||
gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM'
|
||||
, 0: 'OM'})
|
||||
|
||||
gene_LF3['mutation_info_labels_orig'].value_counts()
|
||||
gene_LF3['mutation_info_labels_orig'].value_counts().sum()
|
||||
|
||||
gene_LF3['mutation_info_labels'].value_counts()
|
||||
gene_LF3['mutation_info_labels'].value_counts().sum()
|
||||
#%% TEST muts
|
||||
test_muts = ['G132A', 'V180F', 'G108R', 'A102P']
|
||||
test_muts = ['L4S', 'L4W', 'A102P']
|
||||
|
||||
gene_LF3 = gene_LF2[gene_LF2.loc[:,'mutationinformation'].isin(test_muts)]
|
||||
gene_LF4 = gene_LF3[['id', 'mutationinformation', drug
|
||||
, 'mutation_info_orig', 'dm_om_numeric_orig', 'dst', 'dst_multimode']]
|
||||
|
||||
# Reset index as it allows the groupby expression to directly map
|
||||
gene_LF4.index
|
||||
gene_LF4= gene_LF4.set_index(['mutationinformation'])
|
||||
gene_LF4.index
|
||||
|
||||
# Get multimode for dm_om_numeric column
|
||||
dm_om_multimode_LF4 = gene_LF4.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
|
||||
dm_om_multimode_LF4
|
||||
|
||||
gene_LF4['dst_multimode'] = gene_LF4['dst_multimode'].fillna(dm_om_multimode_LF4)
|
||||
|
||||
# LINEAGE
|
||||
foo = gene_LF2.copy()
|
||||
foo = foo[foo.loc[:,'mutationinformation'].isin(test_muts)]
|
||||
foo = foo[['id', 'mutationinformation','lineage' ]]
|
||||
foo['MUT'] = foo['mutationinformation']
|
||||
foo['lineage'] = foo['lineage'].str.strip()
|
||||
foo['lineage_corrupt'] = foo['lineage']
|
||||
|
||||
#foo['lineage_ucount'] = foo['mutationinformation'].map(foo.groupby('mutationinformation')['lineage'].nunique())# seems wrong!
|
||||
|
||||
foo2 = tidy_split(foo, 'lineage_corrupt', sep = ';')
|
||||
foo2['lineage_corrupt'] = foo2['lineage_corrupt'].str.strip()
|
||||
foo2['lineage_corrupt_list'] = foo2['mutationinformation'].map(foo2.groupby('mutationinformation')['lineage_corrupt'].apply(list))
|
||||
foo2['lineage_corrupt_ucount'] = foo2['mutationinformation'].map(foo2.groupby('mutationinformation')['lineage_corrupt'].nunique())
|
||||
|
||||
foo2.groupby('mutationinformation')['lineage_corrupt'].value_counts()
|
||||
foo2['lineage_corrupt'].value_counts()
|
||||
foo2['lineage_corrupt_ucount']
|
||||
foo2.index
|
||||
foo2 = foo2.set_index(['mutationinformation'])
|
||||
|
||||
|
||||
# now merge
|
||||
foo.index
|
||||
foo.index.nunique()
|
||||
foo2.index.nunique()
|
||||
foo_copy = foo.copy()
|
||||
|
||||
foo_copy['lineage_ucount'] = foo_copy['lineage']
|
||||
foo_copy.loc[foo2.index, 'lineage_ucount'] = foo2['lineage_corrupt_ucount']
|
||||
|
||||
#%%FIXME: do regex for lineage for meta data else the ; messes it up
|
||||
#--------------------------
|
||||
# lineage multimode mode
|
||||
#--------------------------
|
||||
lineage_label_map = {'lineage1' : 'L1'
|
||||
, 'lineage2' : 'L2'
|
||||
, 'lineage3' : 'L3'
|
||||
, 'lineage4' : 'L4'
|
||||
, 'lineage5' : 'L5'
|
||||
, 'lineage6' : 'L6'
|
||||
, 'lineage7' : 'L7'
|
||||
, 'lineageBOV' : 'LBOV'}
|
||||
|
||||
foo['lineage'].value_counts()
|
||||
|
||||
lineage_label_numeric = {'lineage1' : 1
|
||||
, 'lineage2' : 2
|
||||
, 'lineage3' : 3
|
||||
, 'lineage4' : 4
|
||||
, 'lineage5' : 5
|
||||
, 'lineage6' : 6
|
||||
, 'lineage7' : 7
|
||||
, 'lineageBOV' : 8}
|
||||
|
||||
|
||||
lineage_label_numeric
|
||||
foo2['lineage_corrupt'].value_counts()
|
||||
foo2['lineage_numeric'] = foo2['lineage_corrupt'].map(lineage_label_numeric)
|
||||
foo2['lineage_numeric'].value_counts()
|
||||
|
||||
foo2['lineage_numeric_list'] = foo2['mutationinformation'].map(foo2.groupby('mutationinformation')['lineage_numeric'].apply(list))
|
||||
foo2['lineage_numeric_list']
|
||||
|
||||
foo2['lineage_multimode'] = foo2.groupby(['mutationinformation'])['lineage_numeric'].agg(multimode)
|
||||
c2 = foo2[foo2.loc[:, 'MUT'].isin(['A102P'])]
|
||||
c2['lineage_numeric'].value_counts()
|
||||
|
||||
|
||||
#%% Lineage counts (including the ones containing multiple entries)
|
||||
|
||||
# Get information about how many distinct lineages each mutation comes from
|
||||
gene_LF3['lineage'].value_counts()
|
||||
gene_LF3['lineage'] = gene_LF3['lineage'].str.strip()
|
||||
gene_LF3['lineage'].value_counts()
|
||||
|
||||
# Create a column: lineage_corrupt
|
||||
gene_LF3['lineage_corrupt'] = gene_LF3['lineage']
|
||||
|
||||
# Create df with tidy_split: lineage
|
||||
lf_lin_split = tidy_split(gene_LF3, 'lineage_corrupt', sep = ';')
|
||||
lf_lin_split['lineage_corrupt'] = lf_lin_split['lineage_corrupt'].str.strip()
|
||||
lf_lin_split['lineage_corrupt_list'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_corrupt'].apply(list))
|
||||
lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_corrupt'].nunique())
|
||||
|
||||
#----------------------------------
|
||||
# Merge with gene_LF3 with
|
||||
# lf_lin_split
|
||||
#-----------------------------------
|
||||
gene_LF3['lineage_ucount'] = gene_LF3['lineage']
|
||||
|
||||
# quick checks
|
||||
gene_LF3['lineage_ucount'].equals(gene_LF3['lineage'])
|
||||
|
||||
# merge based on indices
|
||||
gene_LF3.index.nunique()
|
||||
lf_lin_split.index.nunique()
|
||||
all(gene_LF3.index.isin(lf_lin_split.index))
|
||||
all(lf_lin_split.index.isin(gene_LF3.index))
|
||||
|
||||
# magic merge happens here
|
||||
gene_LF3.loc[lf_lin_split.index, 'lineage_ucount'] = lf_lin_split['lineage_corrupt_ucount']
|
||||
|
||||
#%% sanity checks
|
||||
check1 = gene_LF3[['mutationinformation', 'lineage', 'lineage_ucount']]
|
||||
check2 = check1[check1.loc[:, 'mutationinformation'].isin(['H57D'])]
|
||||
check2.value_counts()
|
||||
|
||||
#%%
|
Loading…
Add table
Add a link
Reference in a new issue