mostly done, now adding lineage magicry
This commit is contained in:
parent
f05cb96346
commit
ae3a5500c9
1 changed files with 302 additions and 42 deletions
|
@ -1440,79 +1440,339 @@ else:
|
||||||
|
|
||||||
# clear variables
|
# clear variables
|
||||||
del(k, v, wt, mut, lookup_dict)
|
del(k, v, wt, mut, lookup_dict)
|
||||||
|
#%% NEW mappings: gene_LF2
|
||||||
|
# gene_LF2: copy gene_LF1
|
||||||
|
gene_LF2 = gene_LF1.copy()
|
||||||
|
|
||||||
#%% NEW TODO: map mutationinformation
|
#%% AF for gene
|
||||||
|
gene_LF2['id'].nunique()
|
||||||
|
gene_LF2['mutationinformation'].nunique()
|
||||||
|
total_id_ucount = gene_LF2['id'].nunique()
|
||||||
|
total_id_ucount
|
||||||
|
total_id_ucount2 = gene_LF2['sample'].nunique()
|
||||||
|
total_id_ucount2
|
||||||
|
|
||||||
#%% NEW: mappings
|
if total_id_ucount == total_id_ucount2:
|
||||||
|
print('\nPASS: sample and id unique counts match')
|
||||||
|
else:
|
||||||
|
print('\nFAIL: sample and id unique counts DO NOT match!'
|
||||||
|
, '\nGWAS worry!?')
|
||||||
|
|
||||||
|
# Add all sample ids in a list for sanity checks
|
||||||
|
#gene_LF2['id_list'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].apply(list))
|
||||||
|
|
||||||
|
#=======================================
|
||||||
|
# Add column: total unique id/sample count
|
||||||
|
#=======================================
|
||||||
|
gene_LF2['total_id_ucount'] = total_id_ucount
|
||||||
|
|
||||||
|
#==========================================
|
||||||
|
# Add column: mutation count in all samples
|
||||||
|
#==========================================
|
||||||
|
gene_LF2['mut_id_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique())
|
||||||
|
gene_LF2['mut_id_ucount']
|
||||||
|
|
||||||
|
#=================
|
||||||
|
# Add column: AF
|
||||||
|
#=================
|
||||||
|
gene_LF2['maf'] = gene_LF2['mut_id_ucount']/gene_LF2['total_id_ucount']
|
||||||
|
gene_LF2['maf'].head()
|
||||||
|
|
||||||
|
#%% Mapping 1: column '<drug>', mutation_info
|
||||||
#=======================
|
#=======================
|
||||||
# column name: <drug>
|
# column name: <drug>
|
||||||
#=======================
|
#=======================
|
||||||
# mapping 1: labels
|
# mapping 1.1: labels
|
||||||
dm_om_label_map = {dr_muts_col: 'DM'
|
dm_om_label_map = {dr_muts_col: 'DM'
|
||||||
, other_muts_col: 'OM'}
|
, other_muts_col: 'OM'}
|
||||||
|
dm_om_label_map
|
||||||
|
gene_LF2['mutation_info_labels'] = gene_LF2['mutation_info'].map(dm_om_label_map)
|
||||||
|
|
||||||
gene_LF0['mutation_info_labels'] = gene_LF0['mutation_info'].map(dm_om_label_map)
|
# mapping 1.2: numeric
|
||||||
|
|
||||||
# mapping 2: numeric
|
|
||||||
dm_om_num_map = {dr_muts_col: 1
|
dm_om_num_map = {dr_muts_col: 1
|
||||||
, other_muts_col: 0}
|
, other_muts_col: 0}
|
||||||
|
|
||||||
gene_LF0['dm_om_numeric'] = gene_LF0['mutation_info'].map(dm_om_num_map)
|
gene_LF2['dm_om_numeric'] = gene_LF2['mutation_info'].map(dm_om_num_map)
|
||||||
|
gene_LF2['dm_om_numeric_orig'] = gene_LF2['mutation_info_orig'].map(dm_om_num_map)
|
||||||
|
|
||||||
gene_LF0['mutation_info'].value_counts()
|
gene_LF2['mutation_info'].value_counts()
|
||||||
gene_LF0['mutation_info_labels'].value_counts()
|
gene_LF2['mutation_info_labels'].value_counts()
|
||||||
gene_LF0['dm_om_numeric'].value_counts()
|
gene_LF2['dm_om_numeric'].value_counts()
|
||||||
|
gene_LF2['dm_om_numeric_orig'].value_counts()
|
||||||
|
|
||||||
|
#%% Mapping 2: column '<drtype>', mutation
|
||||||
#============================
|
#============================
|
||||||
# column name: <drtype>
|
# column name: <drtype>
|
||||||
#============================
|
#============================
|
||||||
gene_LF0['drtype'].value_counts()
|
gene_LF2['drtype'].value_counts()
|
||||||
|
|
||||||
# mapping: numeric
|
# mapping 2.1: numeric
|
||||||
drtype_map = {'XDR': 5
|
drtype_map = {'XDR': 5
|
||||||
, 'Pre-XDR': 4
|
, 'Pre-XDR': 4
|
||||||
, 'MDR': 3
|
, 'MDR': 3
|
||||||
, 'Pre-MDR': 2
|
, 'Pre-MDR': 2
|
||||||
, 'Other': 1
|
, 'Other': 1
|
||||||
, 'Sensitive': 0}
|
, 'Sensitive': 0}
|
||||||
gene_LF0['drtype_numeric'] = gene_LF0['drtype'].map(drtype_map)
|
|
||||||
|
|
||||||
gene_LF0['drtype'].value_counts()
|
gene_LF2['drtype_numeric'] = gene_LF2['drtype'].map(drtype_map)
|
||||||
gene_LF0['drtype_numeric'].value_counts()
|
|
||||||
|
|
||||||
#%% multimode
|
gene_LF2['drtype'].value_counts()
|
||||||
# COPY dst column
|
gene_LF2['drtype_numeric'].value_counts()
|
||||||
gene_LF0['dst'] = gene_LF0[drug] # to allow cross checking
|
|
||||||
gene_LF0['dst_multimode'] = gene_LF0[drug]
|
#%% Recalculations: Multimode
|
||||||
|
# copy dst column
|
||||||
|
gene_LF2['dst'] = gene_LF2[drug] # to allow cross checking
|
||||||
|
gene_LF2['dst'].equals(gene_LF2[drug])
|
||||||
|
|
||||||
|
gene_LF2['dst_multimode'] = gene_LF2[drug]
|
||||||
|
|
||||||
# sanity check
|
# sanity check
|
||||||
gene_LF0[drug].value_counts()
|
gene_LF2[drug].value_counts()
|
||||||
gene_LF0['dst_multimode'].value_counts()
|
gene_LF2['dst_multimode'].value_counts()
|
||||||
gene_LF0[drug].isna().sum()
|
|
||||||
|
|
||||||
if gene_LF0[drug].value_counts().sum()+gene_LF0[drug].isna().sum() == len(gene_LF0):
|
gene_LF2[drug].isnull().sum()
|
||||||
print('\nPASS:', 'Numbers match')
|
gene_LF2['dst_multimode'].isnull().sum()
|
||||||
else:
|
|
||||||
print('\nFAIL:', 'Numbers mismatch')
|
|
||||||
|
|
||||||
gene_LF0['mutation'].value_counts()
|
gene_LF2['mutationinformation'].value_counts()
|
||||||
#data.C.isnull().groupby([df['A'],df['B']]).sum().astype(int).reset_index(name='count')
|
gene_LF2[drug].isnull().groupby(gene_LF2['mutationinformation']).sum()
|
||||||
gene_LF0[drug].isnull().groupby(gene_LF0['mutation']).sum()
|
|
||||||
|
|
||||||
# GOAL is to populate na in the dst column from the count of the dm_om_numeric column
|
# GOAL is to populate na in the dst column from the count of the dm_om_numeric column
|
||||||
gene_LF0['dst_multimode'].isnull().groupby(gene_LF0['mutationinformation']).sum()
|
gene_LF2['dst_multimode'].isnull().groupby(gene_LF2['mutationinformation']).sum()
|
||||||
|
gene_LF2.index
|
||||||
|
gene_LF2['index_orig'] = gene_LF2.index # need it for setting back later
|
||||||
|
#%% FIXME: Add sanity check
|
||||||
|
#gene_LF2['index_orig'].equals(gene_LF2.index)
|
||||||
|
#%% Set index: 'mutationinformation' for adding multimode
|
||||||
|
gene_LF3 = gene_LF2.set_index(['mutationinformation'])
|
||||||
|
gene_LF3.index
|
||||||
|
gene_LF3['dst_multimode'].value_counts()
|
||||||
|
gene_LF3['dst_multimode'].value_counts().sum()
|
||||||
|
#%% Multimode: dst
|
||||||
|
# For each mutation, generate the revised dst which is the mode of dm_om_numeric
|
||||||
|
#=============================
|
||||||
|
# Recalculation: Revised dst
|
||||||
|
# max(multimode)
|
||||||
|
#=============================
|
||||||
|
# Get multimode for dm_om_numeric column
|
||||||
|
dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
|
||||||
|
dm_om_multimode_LF3
|
||||||
|
|
||||||
# COPY mutationinformation for sanity check
|
# Fill using multimode ONLY where NA in dst_multimode column
|
||||||
#data['mutation'] = data['mutationinformation']
|
gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3)
|
||||||
gene_LF0['mutationinformation'] = gene_LF0['mutation']
|
|
||||||
|
# Now get the max from multimode
|
||||||
|
gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
|
||||||
|
print(gene_LF3)
|
||||||
|
|
||||||
|
#----------------------------
|
||||||
|
# Revised dst column: Max
|
||||||
|
#----------------------------
|
||||||
|
# Finally created a revised dst with the max from the multimode
|
||||||
|
gene_LF3['dst_mode'] = gene_LF3.groupby('mutationinformation')['dst_noNA'].max()
|
||||||
|
|
||||||
|
#%% Multimode: drtype
|
||||||
|
#=============================
|
||||||
|
# Recalculation: Revised drtype
|
||||||
|
# max(multimode)
|
||||||
|
#=============================
|
||||||
|
#--------------------------------
|
||||||
|
# drtype: ALL values:
|
||||||
|
# numeric and names in an array
|
||||||
|
#--------------------------------
|
||||||
|
gene_LF3['drtype_all_vals'] = gene_LF3['drtype_numeric']
|
||||||
|
gene_LF3['drtype_all_names'] = gene_LF3['drtype']
|
||||||
|
|
||||||
|
gene_LF3['drtype_all_vals'] = gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list)
|
||||||
|
gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list)
|
||||||
|
|
||||||
|
#---------------------------------
|
||||||
|
# Revised drtype: max(Multimode)
|
||||||
|
#--------------------------------
|
||||||
|
gene_LF3['drtype_multimode'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode)
|
||||||
|
gene_LF3['drtype_multimode']
|
||||||
|
|
||||||
|
# Now get the max from multimode
|
||||||
|
gene_LF3['drtype_mode'] = gene_LF3['drtype_multimode'].apply(lambda x: np.nanmax(x))
|
||||||
|
gene_LF3.head()
|
||||||
|
|
||||||
|
#----------------------
|
||||||
|
# Revised drtype: Max
|
||||||
|
#----------------------
|
||||||
|
gene_LF3.head()
|
||||||
|
gene_LF3['drtype_max'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].max()
|
||||||
|
gene_LF3.head()
|
||||||
|
|
||||||
|
#%% Reset index: original indices
|
||||||
|
#gene_LF3 = gene_LF3.reset_index()
|
||||||
|
gene_LF3.index
|
||||||
|
gene_LF3['mutationinformation'] = gene_LF3.index
|
||||||
|
gene_LF3 = gene_LF3.set_index(['index_orig'])
|
||||||
|
|
||||||
|
gene_LF3[['mutationinformation']]
|
||||||
|
gene_LF3.index
|
||||||
|
#%% Revised counts
|
||||||
|
gene_LF3['dst_mode'].value_counts()
|
||||||
|
gene_LF3[drug].value_counts()
|
||||||
|
|
||||||
|
print('\n------------------------------------------------------'
|
||||||
|
, '\nRevised counting: mutation_info i.e dm om column'
|
||||||
|
, '\n-----------------------------------------------------'
|
||||||
|
|
||||||
|
, '\n----------------------------------'
|
||||||
|
, '\nOriginal drug column count'
|
||||||
|
, '\n----------------------------------'
|
||||||
|
, gene_LF3[drug].value_counts()
|
||||||
|
, '\nTotal samples [original]:', gene_LF3[drug].value_counts().sum()
|
||||||
|
|
||||||
|
, '\n----------------------------------'
|
||||||
|
, '\nRevised drug column count'
|
||||||
|
, '\n----------------------------------'
|
||||||
|
, gene_LF3['dst_mode'].value_counts()
|
||||||
|
, '\nTotal samples [revised]:', gene_LF3['dst_mode'].value_counts().sum()
|
||||||
|
)
|
||||||
|
#%% FIXME: CHECK THIS, run this with mutation_info_REV
|
||||||
|
#---------------------------------------
|
||||||
|
# Create revised mutation_info_column
|
||||||
|
#---------------------------------------
|
||||||
|
# Note this is overriding, since downstream depends on it
|
||||||
|
# make a copy you if you need to keep that
|
||||||
|
# create a copy before running this
|
||||||
|
|
||||||
|
gene_LF3['mutation_info_labels_orig'] = gene_LF3['mutation_info_labels']
|
||||||
|
|
||||||
|
gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM'
|
||||||
|
, 0: 'OM'})
|
||||||
|
|
||||||
|
gene_LF3['mutation_info_labels_orig'].value_counts()
|
||||||
|
gene_LF3['mutation_info_labels_orig'].value_counts().sum()
|
||||||
|
|
||||||
|
gene_LF3['mutation_info_labels'].value_counts()
|
||||||
|
gene_LF3['mutation_info_labels'].value_counts().sum()
|
||||||
|
#%% TEST muts
|
||||||
|
test_muts = ['G132A', 'V180F', 'G108R', 'A102P']
|
||||||
|
test_muts = ['L4S', 'L4W', 'A102P']
|
||||||
|
|
||||||
|
gene_LF3 = gene_LF2[gene_LF2.loc[:,'mutationinformation'].isin(test_muts)]
|
||||||
|
gene_LF4 = gene_LF3[['id', 'mutationinformation', drug
|
||||||
|
, 'mutation_info_orig', 'dm_om_numeric_orig', 'dst', 'dst_multimode']]
|
||||||
|
|
||||||
|
# Reset index as it allows the groupby expression to directly map
|
||||||
|
gene_LF4.index
|
||||||
|
gene_LF4= gene_LF4.set_index(['mutationinformation'])
|
||||||
|
gene_LF4.index
|
||||||
|
|
||||||
# Get multimode for dm_om_numeric column
|
# Get multimode for dm_om_numeric column
|
||||||
dm_om_multimode = gene_LF0.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
|
dm_om_multimode_LF4 = gene_LF4.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
|
||||||
dm_om_multimode
|
dm_om_multimode_LF4
|
||||||
|
|
||||||
gene_LF0['dst_multimode'] = gene_LF0['dst_multimode'].fillna(dm_om_multimode)
|
gene_LF4['dst_multimode'] = gene_LF4['dst_multimode'].fillna(dm_om_multimode_LF4)
|
||||||
gene_LF0['dst_noNA'] = gene_LF0['dst_multimode'].apply(lambda x: np.nanmax(x))
|
|
||||||
print(gene_LF0)
|
|
||||||
|
|
||||||
# Finally created a revised dst with the max from the multimode
|
# LINEAGE
|
||||||
gene_LF0['dst_mode'] = gene_LF0.groupby('mutationinformation')['dst_noNA'].max()
|
foo = gene_LF2.copy()
|
||||||
|
foo = foo[foo.loc[:,'mutationinformation'].isin(test_muts)]
|
||||||
|
foo = foo[['id', 'mutationinformation','lineage' ]]
|
||||||
|
foo['MUT'] = foo['mutationinformation']
|
||||||
|
foo['lineage'] = foo['lineage'].str.strip()
|
||||||
|
foo['lineage_corrupt'] = foo['lineage']
|
||||||
|
|
||||||
|
#foo['lineage_ucount'] = foo['mutationinformation'].map(foo.groupby('mutationinformation')['lineage'].nunique())# seems wrong!
|
||||||
|
|
||||||
|
foo2 = tidy_split(foo, 'lineage_corrupt', sep = ';')
|
||||||
|
foo2['lineage_corrupt'] = foo2['lineage_corrupt'].str.strip()
|
||||||
|
foo2['lineage_corrupt_list'] = foo2['mutationinformation'].map(foo2.groupby('mutationinformation')['lineage_corrupt'].apply(list))
|
||||||
|
foo2['lineage_corrupt_ucount'] = foo2['mutationinformation'].map(foo2.groupby('mutationinformation')['lineage_corrupt'].nunique())
|
||||||
|
|
||||||
|
foo2.groupby('mutationinformation')['lineage_corrupt'].value_counts()
|
||||||
|
foo2['lineage_corrupt'].value_counts()
|
||||||
|
foo2['lineage_corrupt_ucount']
|
||||||
|
foo2.index
|
||||||
|
foo2 = foo2.set_index(['mutationinformation'])
|
||||||
|
|
||||||
|
|
||||||
|
# now merge
|
||||||
|
foo.index
|
||||||
|
foo.index.nunique()
|
||||||
|
foo2.index.nunique()
|
||||||
|
foo_copy = foo.copy()
|
||||||
|
|
||||||
|
foo_copy['lineage_ucount'] = foo_copy['lineage']
|
||||||
|
foo_copy.loc[foo2.index, 'lineage_ucount'] = foo2['lineage_corrupt_ucount']
|
||||||
|
|
||||||
|
#%%FIXME: do regex for lineage for meta data else the ; messes it up
|
||||||
|
#--------------------------
|
||||||
|
# lineage multimode mode
|
||||||
|
#--------------------------
|
||||||
|
lineage_label_map = {'lineage1' : 'L1'
|
||||||
|
, 'lineage2' : 'L2'
|
||||||
|
, 'lineage3' : 'L3'
|
||||||
|
, 'lineage4' : 'L4'
|
||||||
|
, 'lineage5' : 'L5'
|
||||||
|
, 'lineage6' : 'L6'
|
||||||
|
, 'lineage7' : 'L7'
|
||||||
|
, 'lineageBOV' : 'LBOV'}
|
||||||
|
|
||||||
|
foo['lineage'].value_counts()
|
||||||
|
|
||||||
|
lineage_label_numeric = {'lineage1' : 1
|
||||||
|
, 'lineage2' : 2
|
||||||
|
, 'lineage3' : 3
|
||||||
|
, 'lineage4' : 4
|
||||||
|
, 'lineage5' : 5
|
||||||
|
, 'lineage6' : 6
|
||||||
|
, 'lineage7' : 7
|
||||||
|
, 'lineageBOV' : 8}
|
||||||
|
|
||||||
|
|
||||||
|
lineage_label_numeric
|
||||||
|
foo2['lineage_corrupt'].value_counts()
|
||||||
|
foo2['lineage_numeric'] = foo2['lineage_corrupt'].map(lineage_label_numeric)
|
||||||
|
foo2['lineage_numeric'].value_counts()
|
||||||
|
|
||||||
|
foo2['lineage_numeric_list'] = foo2['mutationinformation'].map(foo2.groupby('mutationinformation')['lineage_numeric'].apply(list))
|
||||||
|
foo2['lineage_numeric_list']
|
||||||
|
|
||||||
|
foo2['lineage_multimode'] = foo2.groupby(['mutationinformation'])['lineage_numeric'].agg(multimode)
|
||||||
|
c2 = foo2[foo2.loc[:, 'MUT'].isin(['A102P'])]
|
||||||
|
c2['lineage_numeric'].value_counts()
|
||||||
|
|
||||||
|
|
||||||
|
#%% Lineage counts (including the ones containing multiple entries)
|
||||||
|
|
||||||
|
# Get information about how many distinct lineages each mutation comes from
|
||||||
|
gene_LF3['lineage'].value_counts()
|
||||||
|
gene_LF3['lineage'] = gene_LF3['lineage'].str.strip()
|
||||||
|
gene_LF3['lineage'].value_counts()
|
||||||
|
|
||||||
|
# Create a column: lineage_corrupt
|
||||||
|
gene_LF3['lineage_corrupt'] = gene_LF3['lineage']
|
||||||
|
|
||||||
|
# Create df with tidy_split: lineage
|
||||||
|
lf_lin_split = tidy_split(gene_LF3, 'lineage_corrupt', sep = ';')
|
||||||
|
lf_lin_split['lineage_corrupt'] = lf_lin_split['lineage_corrupt'].str.strip()
|
||||||
|
lf_lin_split['lineage_corrupt_list'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_corrupt'].apply(list))
|
||||||
|
lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_corrupt'].nunique())
|
||||||
|
|
||||||
|
#----------------------------------
|
||||||
|
# Merge with gene_LF3 with
|
||||||
|
# lf_lin_split
|
||||||
|
#-----------------------------------
|
||||||
|
gene_LF3['lineage_ucount'] = gene_LF3['lineage']
|
||||||
|
|
||||||
|
# quick checks
|
||||||
|
gene_LF3['lineage_ucount'].equals(gene_LF3['lineage'])
|
||||||
|
|
||||||
|
# merge based on indices
|
||||||
|
gene_LF3.index.nunique()
|
||||||
|
lf_lin_split.index.nunique()
|
||||||
|
all(gene_LF3.index.isin(lf_lin_split.index))
|
||||||
|
all(lf_lin_split.index.isin(gene_LF3.index))
|
||||||
|
|
||||||
|
# magic merge happens here
|
||||||
|
gene_LF3.loc[lf_lin_split.index, 'lineage_ucount'] = lf_lin_split['lineage_corrupt_ucount']
|
||||||
|
|
||||||
|
#%% sanity checks
|
||||||
|
check1 = gene_LF3[['mutationinformation', 'lineage', 'lineage_ucount']]
|
||||||
|
check2 = check1[check1.loc[:, 'mutationinformation'].isin(['H57D'])]
|
||||||
|
check2.value_counts()
|
||||||
|
|
||||||
|
#%%
|
Loading…
Add table
Add a link
Reference in a new issue