mostly done, now adding lineage magicry

This commit is contained in:
Tanushree Tunstall 2022-04-14 19:27:21 +01:00
parent f05cb96346
commit ae3a5500c9

View file

@ -1440,79 +1440,339 @@ else:
# clear variables # clear variables
del(k, v, wt, mut, lookup_dict) del(k, v, wt, mut, lookup_dict)
#%% NEW mappings: gene_LF2
# gene_LF2: copy gene_LF1
gene_LF2 = gene_LF1.copy()
#%% NEW TODO: map mutationinformation #%% AF for gene
# Unique-count bookkeeping on gene_LF2.
# Bare expressions in this cell have no effect on the data; they exist only
# so the value can be inspected when the cell is run interactively.
gene_LF2['id'].nunique()
gene_LF2['mutationinformation'].nunique()
# Number of distinct values in the 'id' column
total_id_ucount = gene_LF2['id'].nunique()
total_id_ucount
# Number of distinct values in the 'sample' column (kept separately so the
# two counts can be compared)
total_id_ucount2 = gene_LF2['sample'].nunique()
total_id_ucount2
#%% NEW: mappings if total_id_ucount == total_id_ucount2:
print('\nPASS: sample and id unique counts match')
else:
print('\nFAIL: sample and id unique counts DO NOT match!'
, '\nGWAS worry!?')
#==========================================================
# AF columns added to gene_LF2
#   total_id_ucount : total unique id count (same value on every row)
#   mut_id_ucount   : unique id count per 'mutationinformation'
#   maf             : mut_id_ucount / total_id_ucount
#==========================================================
# Optional sanity aid: all sample ids per mutation collected in a list
#gene_LF2['id_list'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].apply(list))

# total unique id/sample count
gene_LF2['total_id_ucount'] = total_id_ucount

# mutation count in all samples: per-mutation unique ids, mapped back to rows
gene_LF2['mut_id_ucount'] = gene_LF2['mutationinformation'].map(
    gene_LF2.groupby('mutationinformation')['id'].nunique()
)
gene_LF2['mut_id_ucount']

# AF
gene_LF2['maf'] = gene_LF2['mut_id_ucount'].div(gene_LF2['total_id_ucount'])
gene_LF2['maf'].head()
#%% Mapping 1: column '<drug>', mutation_info
#======================= #=======================
# column name: <drug> # column name: <drug>
#======================= #=======================
# mapping 1: labels # mapping 1.1: labels
dm_om_label_map = {dr_muts_col: 'DM' dm_om_label_map = {dr_muts_col: 'DM'
, other_muts_col: 'OM'} , other_muts_col: 'OM'}
dm_om_label_map
gene_LF2['mutation_info_labels'] = gene_LF2['mutation_info'].map(dm_om_label_map)
gene_LF0['mutation_info_labels'] = gene_LF0['mutation_info'].map(dm_om_label_map) # mapping 1.2: numeric
# mapping 2: numeric
dm_om_num_map = {dr_muts_col: 1 dm_om_num_map = {dr_muts_col: 1
, other_muts_col: 0} , other_muts_col: 0}
gene_LF0['dm_om_numeric'] = gene_LF0['mutation_info'].map(dm_om_num_map) gene_LF2['dm_om_numeric'] = gene_LF2['mutation_info'].map(dm_om_num_map)
gene_LF2['dm_om_numeric_orig'] = gene_LF2['mutation_info_orig'].map(dm_om_num_map)
gene_LF0['mutation_info'].value_counts() gene_LF2['mutation_info'].value_counts()
gene_LF0['mutation_info_labels'].value_counts() gene_LF2['mutation_info_labels'].value_counts()
gene_LF0['dm_om_numeric'].value_counts() gene_LF2['dm_om_numeric'].value_counts()
gene_LF2['dm_om_numeric_orig'].value_counts()
#%% Mapping 2: column '<drtype>', mutation
#============================ #============================
# column name: <drtype> # column name: <drtype>
#============================ #============================
gene_LF0['drtype'].value_counts() gene_LF2['drtype'].value_counts()
# mapping: numeric # mapping 2.1: numeric
drtype_map = {'XDR': 5 drtype_map = {'XDR': 5
, 'Pre-XDR': 4 , 'Pre-XDR': 4
, 'MDR': 3 , 'MDR': 3
, 'Pre-MDR': 2 , 'Pre-MDR': 2
, 'Other': 1 , 'Other': 1
, 'Sensitive': 0} , 'Sensitive': 0}
gene_LF0['drtype_numeric'] = gene_LF0['drtype'].map(drtype_map)
gene_LF0['drtype'].value_counts() gene_LF2['drtype_numeric'] = gene_LF2['drtype'].map(drtype_map)
gene_LF0['drtype_numeric'].value_counts()
#%% multimode gene_LF2['drtype'].value_counts()
# COPY dst column gene_LF2['drtype_numeric'].value_counts()
gene_LF0['dst'] = gene_LF0[drug] # to allow cross checking
gene_LF0['dst_multimode'] = gene_LF0[drug] #%% Recalculations: Multimode
# copy dst column
gene_LF2['dst'] = gene_LF2[drug] # to allow cross checking
gene_LF2['dst'].equals(gene_LF2[drug])
gene_LF2['dst_multimode'] = gene_LF2[drug]
# sanity check # sanity check
gene_LF0[drug].value_counts() gene_LF2[drug].value_counts()
gene_LF0['dst_multimode'].value_counts() gene_LF2['dst_multimode'].value_counts()
gene_LF0[drug].isna().sum()
if gene_LF0[drug].value_counts().sum()+gene_LF0[drug].isna().sum() == len(gene_LF0): gene_LF2[drug].isnull().sum()
print('\nPASS:', 'Numbers match') gene_LF2['dst_multimode'].isnull().sum()
else:
print('\nFAIL:', 'Numbers mismatch')
gene_LF0['mutation'].value_counts() gene_LF2['mutationinformation'].value_counts()
#data.C.isnull().groupby([df['A'],df['B']]).sum().astype(int).reset_index(name='count') gene_LF2[drug].isnull().groupby(gene_LF2['mutationinformation']).sum()
gene_LF0[drug].isnull().groupby(gene_LF0['mutation']).sum()
# GOAL is to populate na in the dst column from the count of the dm_om_numeric column # GOAL is to populate na in the dst column from the count of the dm_om_numeric column
gene_LF0['dst_multimode'].isnull().groupby(gene_LF0['mutationinformation']).sum() gene_LF2['dst_multimode'].isnull().groupby(gene_LF2['mutationinformation']).sum()
# Prepare gene_LF3: a copy of gene_LF2 indexed by mutation, so that per-mutation
# groupby results can be assigned back by index label.
gene_LF2.index
# Store the current row labels as a column; the index is replaced just below
# and this column is what allows it to be restored afterwards.
gene_LF2['index_orig'] = gene_LF2.index # need it for setting back later
#%% FIXME: Add sanity check
#gene_LF2['index_orig'].equals(gene_LF2.index)
#%% Set index: 'mutationinformation' for adding multimode
# set_index returns a new frame; with the default drop=True the
# 'mutationinformation' column is moved into the index (it is no longer a column).
gene_LF3 = gene_LF2.set_index(['mutationinformation'])
gene_LF3.index
gene_LF3['dst_multimode'].value_counts()
gene_LF3['dst_multimode'].value_counts().sum()
#%% Multimode: dst
# For each mutation, generate the revised dst which is the mode of dm_om_numeric
#=============================
# Recalculation: Revised dst
# max(multimode)
#=============================
# Get multimode for the dm_om_numeric_orig column, one result per mutation.
# 'mutationinformation' is resolved as the index name here.
# NOTE(review): `multimode` is defined outside this section; presumably
# statistics.multimode (returns a list of the most common values) - confirm.
dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
dm_om_multimode_LF3
# COPY mutationinformation for sanity check # Fill using multimode ONLY where NA in dst_multimode column
#data['mutation'] = data['mutationinformation'] gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3)
gene_LF0['mutationinformation'] = gene_LF0['mutation']
# Now get the max from multimode
# Each cell of 'dst_multimode' is reduced to a single number with np.nanmax.
# NOTE(review): assumes a cell is either a scalar dst value or the per-mutation
# multimode result filled in for missing dst - confirm against the fillna step.
gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
print(gene_LF3)
#----------------------------
# Revised dst column: Max
#----------------------------
# Finally created a revised dst with the max from the multimode
# The groupby result has one row per mutation; assigning it to a column aligns
# on index labels, so every row of a mutation receives that mutation's max.
# This relies on gene_LF3 being indexed by 'mutationinformation' at this point.
gene_LF3['dst_mode'] = gene_LF3.groupby('mutationinformation')['dst_noNA'].max()
#%% Multimode: drtype
#=============================
# Recalculation: Revised drtype
# max(multimode)
#=============================
# All groupby results in this cell are assigned back by index label, so the
# cell relies on gene_LF3 being indexed by 'mutationinformation'.
#--------------------------------
# drtype: ALL values:
# numeric and names in an array
#--------------------------------
# Step 1: plain copies of the source columns ...
gene_LF3['drtype_all_vals'] = gene_LF3['drtype_numeric']
gene_LF3['drtype_all_names'] = gene_LF3['drtype']
# Step 2: ... overwritten with the per-mutation list of all values, so every
# row of a mutation carries the full list for that mutation.
gene_LF3['drtype_all_vals'] = gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list)
gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list)
#---------------------------------
# Revised drtype: max(Multimode)
#--------------------------------
# NOTE(review): `multimode` is defined outside this section; presumably it
# returns all tied most-common values per group - confirm.
gene_LF3['drtype_multimode'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode)
gene_LF3['drtype_multimode']
# Now get the max from multimode
# np.nanmax collapses each multimode result to one number
gene_LF3['drtype_mode'] = gene_LF3['drtype_multimode'].apply(lambda x: np.nanmax(x))
gene_LF3.head()
#----------------------
# Revised drtype: Max
#----------------------
gene_LF3.head()
# Per-mutation maximum of drtype_numeric (independent of the mode above)
gene_LF3['drtype_max'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].max()
gene_LF3.head()
#%% Reset index: original indices
# Undo the temporary mutation index: copy the index labels back into a
# 'mutationinformation' column, then index by the saved 'index_orig' column.
#gene_LF3 = gene_LF3.reset_index()
gene_LF3.index
# Index labels -> regular column (set_index below would otherwise discard them)
gene_LF3['mutationinformation'] = gene_LF3.index
# With the default drop=True, 'index_orig' becomes the index and stops being a column
gene_LF3 = gene_LF3.set_index(['index_orig'])
gene_LF3[['mutationinformation']]
gene_LF3.index
#%% Revised counts
# Compare the value counts of the original drug column with the recalculated
# 'dst_mode' column. value_counts() excludes NA by default, so the printed
# totals count non-missing rows only.
gene_LF3['dst_mode'].value_counts()
gene_LF3[drug].value_counts()
# The value_counts Series are passed as separate print arguments, so they are
# rendered with their default repr, separated from the labels by a space.
print('\n------------------------------------------------------'
, '\nRevised counting: mutation_info i.e dm om column'
, '\n-----------------------------------------------------'
, '\n----------------------------------'
, '\nOriginal drug column count'
, '\n----------------------------------'
, gene_LF3[drug].value_counts()
, '\nTotal samples [original]:', gene_LF3[drug].value_counts().sum()
, '\n----------------------------------'
, '\nRevised drug column count'
, '\n----------------------------------'
, gene_LF3['dst_mode'].value_counts()
, '\nTotal samples [revised]:', gene_LF3['dst_mode'].value_counts().sum()
)
#%% FIXME: CHECK THIS, run this with mutation_info_REV
#---------------------------------------
# Create revised mutation_info_column
#---------------------------------------
# Note: this overwrites 'mutation_info_labels', since downstream code depends
# on that column name. The previous labels are preserved in
# 'mutation_info_labels_orig' first; that copy must be taken BEFORE the overwrite.
gene_LF3['mutation_info_labels_orig'] = gene_LF3['mutation_info_labels']
# Relabel from the recalculated dst: 1 -> 'DM', 0 -> 'OM'.
# Any 'dst_mode' value that is not a key of the dict (including NA) maps to NaN.
gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM'
, 0: 'OM'})
# Side-by-side inspection of original vs revised label counts
gene_LF3['mutation_info_labels_orig'].value_counts()
gene_LF3['mutation_info_labels_orig'].value_counts().sum()
gene_LF3['mutation_info_labels'].value_counts()
gene_LF3['mutation_info_labels'].value_counts().sum()
#%% TEST muts
# Small hand-picked set of mutations used to spot-check the multimode logic on
# a frame that is easy to inspect by eye.
# Alternative test set: ['G132A', 'V180F', 'G108R', 'A102P']
test_muts = ['L4S', 'L4W', 'A102P']

# Keep the subset under its own name. Rebinding gene_LF3 here would throw away
# the full recalculated frame (dst_mode, drtype_mode, ...) that the later
# lineage-count cell still reads, leaving it with only the test mutations.
# .copy() makes the subset independent of gene_LF2.
gene_LF3_test = gene_LF2[gene_LF2['mutationinformation'].isin(test_muts)].copy()

# Only the columns needed to follow the dst recalculation
gene_LF4 = gene_LF3_test[['id', 'mutationinformation', drug
                          , 'mutation_info_orig', 'dm_om_numeric_orig', 'dst', 'dst_multimode']]

# Index by mutation so that per-mutation groupby results can be assigned back
# directly by index label
gene_LF4.index
gene_LF4 = gene_LF4.set_index(['mutationinformation'])
gene_LF4.index
# Get multimode for dm_om_numeric column # Get multimode for dm_om_numeric column
dm_om_multimode = gene_LF0.groupby('mutationinformation')['dm_om_numeric'].agg(multimode) dm_om_multimode_LF4 = gene_LF4.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
dm_om_multimode dm_om_multimode_LF4
gene_LF0['dst_multimode'] = gene_LF0['dst_multimode'].fillna(dm_om_multimode) gene_LF4['dst_multimode'] = gene_LF4['dst_multimode'].fillna(dm_om_multimode_LF4)
gene_LF0['dst_noNA'] = gene_LF0['dst_multimode'].apply(lambda x: np.nanmax(x))
print(gene_LF0)
# Finally created a revised dst with the max from the multimode # LINEAGE
gene_LF0['dst_mode'] = gene_LF0.groupby('mutationinformation')['dst_noNA'].max() foo = gene_LF2.copy()
# Scratch work on the test mutations: how many distinct lineages does each
# mutation come from, given that one 'lineage' cell may hold several
# ';'-separated entries?

# Restrict to the test mutations and the columns of interest; .copy() so the
# column assignments below write to an independent frame, not a slice.
foo = foo.loc[foo['mutationinformation'].isin(test_muts)
              , ['id', 'mutationinformation', 'lineage']].copy()
# 'MUT' duplicates the mutation label so it stays available as a column after
# 'mutationinformation' is moved into the index further down.
foo['MUT'] = foo['mutationinformation']
foo['lineage'] = foo['lineage'].str.strip()
foo['lineage_corrupt'] = foo['lineage']
#foo['lineage_ucount'] = foo['mutationinformation'].map(foo.groupby('mutationinformation')['lineage'].nunique())# seems wrong!

# One row per individual lineage entry.
# NOTE(review): tidy_split is defined elsewhere; this cell assumes it repeats
# the source row (keeping its row label) once per ';'-separated value - confirm.
foo2 = tidy_split(foo, 'lineage_corrupt', sep = ';')
foo2['lineage_corrupt'] = foo2['lineage_corrupt'].str.strip()
# Per-mutation list / unique count of lineage entries, mapped back onto rows
foo2['lineage_corrupt_list'] = foo2['mutationinformation'].map(foo2.groupby('mutationinformation')['lineage_corrupt'].apply(list))
foo2['lineage_corrupt_ucount'] = foo2['mutationinformation'].map(foo2.groupby('mutationinformation')['lineage_corrupt'].nunique())
foo2.groupby('mutationinformation')['lineage_corrupt'].value_counts()
foo2['lineage_corrupt'].value_counts()
foo2['lineage_corrupt_ucount']
foo2.index

# now merge
# The merge is done BEFORE foo2 is re-indexed by mutation: foo_copy is keyed
# by foo's row labels, so foo2.index must still hold row labels here. Using it
# after set_index would look up mutation strings in foo_copy's index.
foo.index
foo.index.nunique()
foo2.index.nunique()
foo_copy = foo.copy()
foo_copy['lineage_ucount'] = foo_copy['lineage']
foo_copy.loc[foo2.index, 'lineage_ucount'] = foo2['lineage_corrupt_ucount']

# Index by mutation only after the merge, so per-mutation groupby results can
# be assigned back by index label below.
foo2 = foo2.set_index(['mutationinformation'])
#%%FIXME: do regex for lineage for meta data else the ; messes it up
#--------------------------
# lineage multimode mode
#--------------------------
# Short display labels for the lineage names
lineage_label_map = {'lineage1' : 'L1'
                     , 'lineage2' : 'L2'
                     , 'lineage3' : 'L3'
                     , 'lineage4' : 'L4'
                     , 'lineage5' : 'L5'
                     , 'lineage6' : 'L6'
                     , 'lineage7' : 'L7'
                     , 'lineageBOV' : 'LBOV'}
foo['lineage'].value_counts()
# Numeric codes for the lineage names, needed to compute a mode
lineage_label_numeric = {'lineage1' : 1
                         , 'lineage2' : 2
                         , 'lineage3' : 3
                         , 'lineage4' : 4
                         , 'lineage5' : 5
                         , 'lineage6' : 6
                         , 'lineage7' : 7
                         , 'lineageBOV' : 8}
lineage_label_numeric
foo2['lineage_corrupt'].value_counts()
# Entries that are not a key of the dict map to NaN
foo2['lineage_numeric'] = foo2['lineage_corrupt'].map(lineage_label_numeric)
foo2['lineage_numeric'].value_counts()
# 'mutationinformation' is the index of foo2 at this point, not a column, so
# the per-mutation result is mapped through the 'MUT' copy instead.
foo2['lineage_numeric_list'] = foo2['MUT'].map(foo2.groupby('mutationinformation')['lineage_numeric'].apply(list))
foo2['lineage_numeric_list']
# groupby resolves 'mutationinformation' as the index name; the per-mutation
# result is assigned back by index label.
foo2['lineage_multimode'] = foo2.groupby(['mutationinformation'])['lineage_numeric'].agg(multimode)
# Spot check for a single mutation
c2 = foo2[foo2.loc[:, 'MUT'].isin(['A102P'])]
c2['lineage_numeric'].value_counts()
#%% Lineage counts (including the ones containing multiple entries)
# Get information about how many distinct lineages each mutation comes from
# NOTE(review): this cell expects gene_LF3 to be the full recalculated frame,
# keyed by the original row labels with 'mutationinformation' as a column -
# confirm nothing run before this cell has rebound gene_LF3.
gene_LF3['lineage'].value_counts()
gene_LF3['lineage'] = gene_LF3['lineage'].str.strip()
gene_LF3['lineage'].value_counts()
# Create a column: lineage_corrupt
# (working copy of 'lineage'; this is the column that gets split on ';')
gene_LF3['lineage_corrupt'] = gene_LF3['lineage']
# Create df with tidy_split: lineage
# NOTE(review): tidy_split is defined elsewhere; presumably it emits one row per
# ';'-separated entry and keeps the source row label on each - confirm.
lf_lin_split = tidy_split(gene_LF3, 'lineage_corrupt', sep = ';')
lf_lin_split['lineage_corrupt'] = lf_lin_split['lineage_corrupt'].str.strip()
# Per-mutation list and unique count of the split lineage entries, mapped back
# onto every row of that mutation
lf_lin_split['lineage_corrupt_list'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_corrupt'].apply(list))
lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_corrupt'].nunique())
#----------------------------------
# Merge with gene_LF3 with
# lf_lin_split
#-----------------------------------
# Start from a copy of 'lineage'; rows whose label does not occur in
# lf_lin_split keep this value after the merge below.
gene_LF3['lineage_ucount'] = gene_LF3['lineage']
# quick checks
gene_LF3['lineage_ucount'].equals(gene_LF3['lineage'])
# merge based on indices
# The four expressions below are manual checks that both frames share the same
# set of row labels; their results are not enforced.
gene_LF3.index.nunique()
lf_lin_split.index.nunique()
all(gene_LF3.index.isin(lf_lin_split.index))
all(lf_lin_split.index.isin(gene_LF3.index))
# magic merge happens here
# Overwrite 'lineage_ucount' with the per-mutation unique lineage count for
# every row label present in lf_lin_split.
# NOTE(review): if lf_lin_split repeats row labels (one per split entry), this
# assignment depends on how pandas aligns duplicate labels - verify the result.
gene_LF3.loc[lf_lin_split.index, 'lineage_ucount'] = lf_lin_split['lineage_corrupt_ucount']
#%% sanity checks
# Inspect the merged count for a single mutation
check1 = gene_LF3[['mutationinformation', 'lineage', 'lineage_ucount']]
check2 = check1[check1.loc[:, 'mutationinformation'].isin(['H57D'])]
check2.value_counts()
#%%