saved section for generating revised dst
This commit is contained in:
parent
cb93cef3c7
commit
0867827ec6
1 changed files with 156 additions and 87 deletions
|
@ -1238,10 +1238,12 @@ del(out_filename_metadata_poscounts)
|
|||
#%% Add column: aa_calcprop
|
||||
#%% NEW mappings: gene_LF2
|
||||
# gene_LF2: copy gene_LF1
|
||||
gene_LF2 = gene_LF1.copy()
|
||||
gene_LF2 = gene_LF1.copy()gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list)
|
||||
gene_LF2.index
|
||||
|
||||
#%% Add total unique id count
|
||||
gene_LF2['id'].nunique()
|
||||
gene_LF2['mutationinformation'].nunique()
|
||||
gene_LF2['Mut'].nunique()
|
||||
total_id_ucount = gene_LF2['id'].nunique()
|
||||
total_id_ucount
|
||||
total_id_ucount2 = gene_LF2['sample'].nunique()
|
||||
|
@ -1275,37 +1277,23 @@ gene_LF2['total_id_ucount'] = total_id_ucount
|
|||
gene_LF2['maf'] = gene_LF2['snp_frequency']/gene_LF2['total_id_ucount']
|
||||
gene_LF2['maf'].head()
|
||||
|
||||
#%% Mapping 1: column '<drug>', mutation_info
|
||||
gene_LF2['mutation_info'].value_counts()
|
||||
gene_LF2['mutation_info_v1'].value_counts()
|
||||
gene_LF2['mutation_info_orig'].value_counts()
|
||||
#%% Further mappings: gene_LF3, with mutationinformation as INDEX
|
||||
gene_LF3 = gene_LF2.copy()
|
||||
|
||||
#=======================
|
||||
# column name: <drug>
|
||||
#=======================
|
||||
# mapping 1.1: labels
|
||||
dm_om_label_map = {dr_muts_col: 'DM'
|
||||
, other_muts_col: 'OM'}
|
||||
dm_om_label_map
|
||||
gene_LF2['mutation_info_labels'] = gene_LF2['mutation_info'].map(dm_om_label_map)
|
||||
# Assign index: mutationinformation for mapping
|
||||
gene_LF3 = gene_LF3.set_index(['mutationinformation'])
|
||||
gene_LF3.index
|
||||
gene_LF3['id'].nunique()
|
||||
gene_LF3['Mut'].nunique()
|
||||
gene_LF3.index.nunique()
|
||||
|
||||
# mapping 1.2: numeric
|
||||
dm_om_num_map = {dr_muts_col: 1
|
||||
, other_muts_col: 0}
|
||||
all(gene_LF3['Mut'] == gene_LF3.index)
|
||||
|
||||
gene_LF2['dm_om_numeric'] = gene_LF2['mutation_info'].map(dm_om_num_map)
|
||||
gene_LF2['dm_om_numeric_orig'] = gene_LF2['mutation_info_orig'].map(dm_om_num_map)
|
||||
|
||||
gene_LF2['mutation_info'].value_counts()
|
||||
gene_LF2['mutation_info_labels'].value_counts()
|
||||
gene_LF2['mutation_info_orig'].value_counts()
|
||||
gene_LF2['dm_om_numeric'].value_counts()
|
||||
gene_LF2['dm_om_numeric_orig'].value_counts()
|
||||
#%% Mapping 2: column '<drtype>'
|
||||
#%% Mapping 1: column '<drtype>'
|
||||
#============================
|
||||
# column name: <drtype>
|
||||
#============================
|
||||
gene_LF2['drtype'].value_counts()
|
||||
gene_LF3['drtype'].value_counts()
|
||||
|
||||
# mapping 2.1: numeric
|
||||
drtype_map = {'XDR': 5
|
||||
|
@ -1315,55 +1303,10 @@ drtype_map = {'XDR': 5
|
|||
, 'Other': 1
|
||||
, 'Sensitive': 0}
|
||||
|
||||
gene_LF2['drtype_numeric'] = gene_LF2['drtype'].map(drtype_map)
|
||||
gene_LF3['drtype_numeric'] = gene_LF3['drtype'].map(drtype_map)
|
||||
|
||||
gene_LF2['drtype'].value_counts()
|
||||
gene_LF2['drtype_numeric'].value_counts()
|
||||
#%% Mapping 3: column '<dst>', drug
|
||||
#============================
|
||||
# column name: <dst>
|
||||
#============================
|
||||
# copy dst column
|
||||
gene_LF2['dst'] = gene_LF2[drug] # to allow cross checking
|
||||
gene_LF2['dst'].equals(gene_LF2[drug])
|
||||
|
||||
gene_LF2['dst_multimode'] = gene_LF2[drug]
|
||||
|
||||
gene_LF2[drug].isnull().sum()
|
||||
gene_LF2['dst_multimode'].isnull().sum()
|
||||
#%% Further mappings: gene_LF3
|
||||
gene_LF3 = gene_LF2.copy()
|
||||
gene_LF3.index
|
||||
gene_LF3 = gene_LF3.set_index(['Mut'])
|
||||
gene_LF3.index
|
||||
|
||||
gene_LF3['dst_multimode'].value_counts()
|
||||
gene_LF3['dst_multimode'].value_counts().sum()
|
||||
#%% Multimode: dst
|
||||
# For each mutation, generate the revised dst which is the mode of dm_om_numeric
|
||||
#=============================
|
||||
# Recalculation: Revised dst
|
||||
# max(multimode)
|
||||
#=============================
|
||||
# Get multimode for dm_om_numeric column
|
||||
#dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
|
||||
dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
|
||||
|
||||
dm_om_multimode_LF3
|
||||
|
||||
# Fill using multimode ONLY where NA in dst_multimode column
|
||||
gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3)
|
||||
|
||||
# Now get the max from multimode
|
||||
gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
|
||||
print(gene_LF3)
|
||||
#%% Revised Columns:IMPORTANT
|
||||
#%% Multimode: dst column
|
||||
#----------------------------
|
||||
# Revised dst column: Max
|
||||
#----------------------------
|
||||
# Finally, create a revised dst using the max from the multimode
|
||||
gene_LF3['dst_mode'] = gene_LF3.groupby('mutationinformation')['dst_noNA'].max()
|
||||
gene_LF3['drtype'].value_counts()
|
||||
gene_LF3['drtype_numeric'].value_counts()
|
||||
|
||||
#%% Multimode: drtype
|
||||
#=============================
|
||||
|
@ -1378,12 +1321,14 @@ gene_LF3['drtype_all_vals'] = gene_LF3['drtype_numeric']
|
|||
gene_LF3['drtype_all_names'] = gene_LF3['drtype']
|
||||
|
||||
gene_LF3['drtype_all_vals'] = gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list)
|
||||
gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list)
|
||||
gene_LF3['drtype_all_vals'].head()
|
||||
|
||||
gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list)
|
||||
gene_LF3['drtype_all_names'].head()
|
||||
#---------------------------------
|
||||
# Revised drtype: max(Multimode)
|
||||
#--------------------------------
|
||||
gene_LF3['drtype_multimode'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode)
|
||||
gene_LF3['drtype_multimode'] = gene_LF3.groupby('mutationinformation')['drtype_numeric'].agg(multimode)
|
||||
gene_LF3['drtype_multimode']
|
||||
|
||||
# Now get the max from multimode
|
||||
|
@ -1394,26 +1339,150 @@ gene_LF3.head()
|
|||
# Revised drtype: Max
|
||||
#----------------------
|
||||
gene_LF3.head()
|
||||
gene_LF3['drtype_max'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].max()
|
||||
gene_LF3['drtype_max'] = gene_LF3.groupby('mutationinformation')['drtype_numeric'].max()
|
||||
gene_LF3.head()
|
||||
|
||||
#%% Revised counts checks
|
||||
gene_LF3['dst_mode'].value_counts()
|
||||
foo = gene_LF3[['Mut', 'position', 'drtype', 'drtype_multimode', 'drtype_mode', 'drtype_max']]
|
||||
foo2 = foo.sort_values(['position', 'Mut'])
|
||||
|
||||
###############################################################################
|
||||
#%% Mapping 2: column '<dst>', drug
|
||||
|
||||
#=======================
|
||||
# column name: <drug>
|
||||
#=======================
|
||||
# mapping 1.1: labels
|
||||
dm_om_label_map = {dr_muts_col: 'DM'
|
||||
, other_muts_col: 'OM'}
|
||||
dm_om_label_map
|
||||
gene_LF3['mutation_info_labels'] = gene_LF3['mutation_info'].map(dm_om_label_map)
|
||||
|
||||
# mapping 1.2: numeric
|
||||
dm_om_num_map = {dr_muts_col: 1
|
||||
, other_muts_col: 0}
|
||||
|
||||
gene_LF3['dm_om_numeric'] = gene_LF3['mutation_info'].map(dm_om_num_map)
|
||||
gene_LF3['dm_om_numeric_orig'] = gene_LF3['mutation_info_orig'].map(dm_om_num_map)
|
||||
|
||||
gene_LF3['mutation_info'].value_counts()
|
||||
gene_LF3['mutation_info_labels'].value_counts()
|
||||
gene_LF3['mutation_info_orig'].value_counts()
|
||||
gene_LF3['dm_om_numeric'].value_counts()
|
||||
gene_LF3['dm_om_numeric_orig'].value_counts()
|
||||
|
||||
# Check value_counts: column '<drug>', mutation_info
|
||||
gene_LF3['mutation_info'].value_counts()
|
||||
gene_LF3['mutation_info_v1'].value_counts()
|
||||
gene_LF3['mutation_info_orig'].value_counts()
|
||||
|
||||
#============================
|
||||
# column name: <dst>
|
||||
#============================
|
||||
# copy dst column
|
||||
gene_LF3['dst'] = gene_LF3[drug] # to allow cross checking
|
||||
gene_LF3['dst'].equals(gene_LF3[drug])
|
||||
|
||||
gene_LF3['dst_multimode'] = gene_LF3[drug]
|
||||
|
||||
gene_LF3[drug].isnull().sum()
|
||||
gene_LF3['dst_multimode'].isnull().sum()
|
||||
|
||||
gene_LF3['dst_multimode'].value_counts()
|
||||
gene_LF3['dst_multimode'].value_counts().sum()
|
||||
#%% Multimode: dst
|
||||
# For each mutation, generate the revised dst which is the mode of dm_om_numeric
|
||||
#=============================
|
||||
# Recalculation: Revised dst
|
||||
# max(multimode)
|
||||
#=============================
|
||||
# Get multimode for dm_om_numeric column
|
||||
#dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
|
||||
dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
|
||||
dm_om_multimode_LF3
|
||||
dm_om_multimode_LF3.isnull().sum()
|
||||
|
||||
gene_LF3['dst_multimode_all'] = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
|
||||
gene_LF3['dst_multimode_all']
|
||||
|
||||
# Fill using multimode ONLY where NA in dst_multimode column
|
||||
gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3)
|
||||
gene_LF3['dst_multimode']
|
||||
|
||||
#----------------------------------
|
||||
# Revised dst column: max of mode
|
||||
#----------------------------------
|
||||
# Finally, create a revised dst using the max from the multimode
|
||||
# Now get the max from multimode
|
||||
#gene_LF3['dst_mode'] = gene_LF3.groupby('mutationinformation')['dst_noNA'].max() # this somehow is not right!
|
||||
#gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
|
||||
gene_LF3['dst_mode'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
|
||||
|
||||
# sanity checks
|
||||
gene_LF3['dst_noNA'].equals(gene_LF3['dst_mode'])
|
||||
|
||||
gene_LF3[drug].value_counts()
|
||||
gene_LF3['dst_noNA'].value_counts()
|
||||
gene_LF3['dst_mode'].value_counts()
|
||||
|
||||
foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']]
|
||||
foo2 = foo.sort_values(['position', 'Mut'])
|
||||
|
||||
print('\n------------------------------------------------------'
|
||||
, '\nRevised counting: mutation_info i.e dm om column'
|
||||
, '\n-----------------------------------------------------'
|
||||
, '\nRevised counting: mutation_info i.e dm om column\n'
|
||||
, '\n-----------------------------------------------------\n'
|
||||
|
||||
, '\n----------------------------------'
|
||||
, '\nOriginal drug column count'
|
||||
, '\n----------------------------------'
|
||||
, '\n----------------------------------\n'
|
||||
, gene_LF3[drug].value_counts()
|
||||
, '\nTotal samples [original]:', gene_LF3[drug].value_counts().sum()
|
||||
, '\nTotal samples [original]', gene_LF3[drug].value_counts().sum()
|
||||
|
||||
, '\n----------------------------------'
|
||||
, '\nRevised drug column count'
|
||||
, '\n----------------------------------'
|
||||
, '\nRevised drug column count\n'
|
||||
, '\n----------------------------------\n'
|
||||
, gene_LF3['dst_mode'].value_counts()
|
||||
, '\nTotal samples [revised]:', gene_LF3['dst_mode'].value_counts().sum()
|
||||
, '\nTotal samples [revised]', gene_LF3['dst_mode'].value_counts().sum()
|
||||
|
||||
, '\n----------------------------------'
|
||||
, '\nRevised drug column count: dst_noNA\n'
|
||||
, '\n----------------------------------\n'
|
||||
, gene_LF3['dst_noNA'].value_counts()
|
||||
)
|
||||
#%% Create revised mutation_info_column based on dst_mode
|
||||
#---------------------------------------
|
||||
# Create revised mutation_info_column
|
||||
#---------------------------------------
|
||||
# Will need to overwrite column 'mutation_info_labels', since downstream depends on it
|
||||
|
||||
# Make a copy before overwriting
|
||||
#gene_LF3['mutation_info_labels_v1'] = gene_LF3['mutation_info'].map(dm_om_label_map)
|
||||
gene_LF3['mutation_info_labels_v1'] = gene_LF3['mutation_info_labels']
|
||||
gene_LF3['mutation_info_labels_v1'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()
|
||||
|
||||
# Now overwrite
|
||||
gene_LF3['mutation_info_labels_dst'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
|
||||
gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
|
||||
if (gene_LF3['mutation_info_labels_dst'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()):
|
||||
print('\nRevised mutation_info colum created')
|
||||
gene_LF3 = gene_LF3.drop(['mutation_info_labels_dst'], axis = 1)
|
||||
else:
|
||||
print('\nmutation info labels numbers mismatch'
|
||||
, '\nPlease check section for mapping dst_mode to labels')
|
||||
|
||||
gene_LF3['mutation_info_orig'].value_counts()
|
||||
#gene_LF3['mutation_info_labels'].value_counts()
|
||||
gene_LF3['mutation_info_labels_orig'] = gene_LF3['mutation_info_orig'].map(dm_om_label_map)
|
||||
gene_LF3['mutation_info_labels_orig'].value_counts()
|
||||
|
||||
# %% sanity check for the revised dst
|
||||
gene_LF3[drug].value_counts()
|
||||
gene_LF3[drug].value_counts().sum()
|
||||
gene_LF3['mutation_info_labels_orig'].value_counts()
|
||||
|
||||
gene_LF3['dst_mode'].value_counts()
|
||||
gene_LF3['dst_mode'].value_counts().sum()
|
||||
|
||||
# direct comparison
|
||||
gene_LF3['dst'].value_counts()
|
||||
gene_LF3['mutation_info_labels'].value_counts()
|
||||
#%%
|
Loading…
Add table
Add a link
Reference in a new issue