saved section for generating revised dst
This commit is contained in:
parent
cb93cef3c7
commit
0867827ec6
1 changed files with 156 additions and 87 deletions
|
@ -1238,10 +1238,12 @@ del(out_filename_metadata_poscounts)
|
||||||
#%% Add column: aa_calcprop
|
#%% Add column: aa_calcprop
|
||||||
#%% NEW mappings: gene_LF2
|
#%% NEW mappings: gene_LF2
|
||||||
# gene_LF2: copy gene_LF1
|
# gene_LF2: copy gene_LF1
|
||||||
gene_LF2 = gene_LF1.copy()
|
# Work on a copy so the columns added below do not touch gene_LF1.
# (A stray, unassigned gene_LF3.groupby(...) expression had been pasted onto
# the end of this statement; it made the line a syntax error and referred to
# gene_LF3 before that frame is created, so it is removed.)
gene_LF2 = gene_LF1.copy()
|
||||||
|
gene_LF2.index
|
||||||
|
|
||||||
#%% Add total unique id count
|
#%% Add total unique id count
|
||||||
gene_LF2['id'].nunique()
|
gene_LF2['id'].nunique()
|
||||||
gene_LF2['mutationinformation'].nunique()
|
gene_LF2['Mut'].nunique()
|
||||||
total_id_ucount = gene_LF2['id'].nunique()
|
total_id_ucount = gene_LF2['id'].nunique()
|
||||||
total_id_ucount
|
total_id_ucount
|
||||||
total_id_ucount2 = gene_LF2['sample'].nunique()
|
total_id_ucount2 = gene_LF2['sample'].nunique()
|
||||||
|
@ -1275,37 +1277,23 @@ gene_LF2['total_id_ucount'] = total_id_ucount
|
||||||
gene_LF2['maf'] = gene_LF2['snp_frequency']/gene_LF2['total_id_ucount']
|
gene_LF2['maf'] = gene_LF2['snp_frequency']/gene_LF2['total_id_ucount']
|
||||||
gene_LF2['maf'].head()
|
gene_LF2['maf'].head()
|
||||||
|
|
||||||
#%% Mapping 1: column '<drug>', mutation_info
|
#%% Further mappings: gene_LF3, with mutationinformation as INDEX
|
||||||
gene_LF2['mutation_info'].value_counts()
|
gene_LF3 = gene_LF2.copy()
|
||||||
gene_LF2['mutation_info_v1'].value_counts()
|
|
||||||
gene_LF2['mutation_info_orig'].value_counts()
|
|
||||||
|
|
||||||
#=======================
|
# Assign index: mutationinformation for mapping
|
||||||
# column name: <drug>
|
gene_LF3 = gene_LF3.set_index(['mutationinformation'])
|
||||||
#=======================
|
gene_LF3.index
|
||||||
# mapping 1.1: labels
|
gene_LF3['id'].nunique()
|
||||||
dm_om_label_map = {dr_muts_col: 'DM'
|
gene_LF3['Mut'].nunique()
|
||||||
, other_muts_col: 'OM'}
|
gene_LF3.index.nunique()
|
||||||
dm_om_label_map
|
|
||||||
gene_LF2['mutation_info_labels'] = gene_LF2['mutation_info'].map(dm_om_label_map)
|
|
||||||
|
|
||||||
# mapping 1.2: numeric
|
all(gene_LF3['Mut'] == gene_LF3.index)
|
||||||
dm_om_num_map = {dr_muts_col: 1
|
|
||||||
, other_muts_col: 0}
|
|
||||||
|
|
||||||
gene_LF2['dm_om_numeric'] = gene_LF2['mutation_info'].map(dm_om_num_map)
|
#%% Mapping 1: column '<drtype>'
|
||||||
gene_LF2['dm_om_numeric_orig'] = gene_LF2['mutation_info_orig'].map(dm_om_num_map)
|
|
||||||
|
|
||||||
gene_LF2['mutation_info'].value_counts()
|
|
||||||
gene_LF2['mutation_info_labels'].value_counts()
|
|
||||||
gene_LF2['mutation_info_orig'].value_counts()
|
|
||||||
gene_LF2['dm_om_numeric'].value_counts()
|
|
||||||
gene_LF2['dm_om_numeric_orig'].value_counts()
|
|
||||||
#%% Mapping 2: column '<drtype>'
|
|
||||||
#============================
|
#============================
|
||||||
# column name: <drtype>
|
# column name: <drtype>
|
||||||
#============================
|
#============================
|
||||||
gene_LF2['drtype'].value_counts()
|
gene_LF3['drtype'].value_counts()
|
||||||
|
|
||||||
# mapping 2.1: numeric
|
# mapping 2.1: numeric
|
||||||
drtype_map = {'XDR': 5
|
drtype_map = {'XDR': 5
|
||||||
|
@ -1315,55 +1303,10 @@ drtype_map = {'XDR': 5
|
||||||
, 'Other': 1
|
, 'Other': 1
|
||||||
, 'Sensitive': 0}
|
, 'Sensitive': 0}
|
||||||
|
|
||||||
gene_LF2['drtype_numeric'] = gene_LF2['drtype'].map(drtype_map)
|
gene_LF3['drtype_numeric'] = gene_LF3['drtype'].map(drtype_map)
|
||||||
|
|
||||||
gene_LF2['drtype'].value_counts()
|
gene_LF3['drtype'].value_counts()
|
||||||
gene_LF2['drtype_numeric'].value_counts()
|
gene_LF3['drtype_numeric'].value_counts()
|
||||||
#%% Mapping 3: column '<dst>', drug
|
|
||||||
#============================
|
|
||||||
# column name: <dst>
|
|
||||||
#============================
|
|
||||||
# copy dst column
|
|
||||||
gene_LF2['dst'] = gene_LF2[drug] # to allow cross checking
|
|
||||||
gene_LF2['dst'].equals(gene_LF2[drug])
|
|
||||||
|
|
||||||
gene_LF2['dst_multimode'] = gene_LF2[drug]
|
|
||||||
|
|
||||||
gene_LF2[drug].isnull().sum()
|
|
||||||
gene_LF2['dst_multimode'].isnull().sum()
|
|
||||||
#%% Further mappings: gene_LF3
|
|
||||||
gene_LF3 = gene_LF2.copy()
|
|
||||||
gene_LF3.index
|
|
||||||
gene_LF3 = gene_LF3.set_index(['Mut'])
|
|
||||||
gene_LF3.index
|
|
||||||
|
|
||||||
gene_LF3['dst_multimode'].value_counts()
|
|
||||||
gene_LF3['dst_multimode'].value_counts().sum()
|
|
||||||
#%% Multimode: dst
|
|
||||||
# For each mutation, generate the revised dst which is the mode of dm_om_numeric
|
|
||||||
#=============================
|
|
||||||
# Recalculation: Revised dst
|
|
||||||
# max(multimode)
|
|
||||||
#=============================
|
|
||||||
# Get multimode for dm_om_numeric column
|
|
||||||
#dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
|
|
||||||
dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
|
|
||||||
|
|
||||||
dm_om_multimode_LF3
|
|
||||||
|
|
||||||
# Fill using multimode ONLY where NA in dst_multimode column
|
|
||||||
gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3)
|
|
||||||
|
|
||||||
# Now get the max from multimode
|
|
||||||
gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
|
|
||||||
print(gene_LF3)
|
|
||||||
#%% Revised Columns:IMPORTANT
|
|
||||||
#%% Multimode: dst column
|
|
||||||
#----------------------------
|
|
||||||
# Revised dst column: Max
|
|
||||||
#----------------------------
|
|
||||||
# Finally created a revised dst with the max from the multimode
|
|
||||||
gene_LF3['dst_mode'] = gene_LF3.groupby('mutationinformation')['dst_noNA'].max()
|
|
||||||
|
|
||||||
#%% Multimode: drtype
|
#%% Multimode: drtype
|
||||||
#=============================
|
#=============================
|
||||||
|
@ -1378,12 +1321,14 @@ gene_LF3['drtype_all_vals'] = gene_LF3['drtype_numeric']
|
||||||
gene_LF3['drtype_all_names'] = gene_LF3['drtype']
|
gene_LF3['drtype_all_names'] = gene_LF3['drtype']
|
||||||
|
|
||||||
gene_LF3['drtype_all_vals'] = gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list)
|
gene_LF3['drtype_all_vals'] = gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list)
|
||||||
gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list)
|
gene_LF3['drtype_all_vals'].head()
|
||||||
|
|
||||||
|
gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list)
|
||||||
|
gene_LF3['drtype_all_names'].head()
|
||||||
#---------------------------------
|
#---------------------------------
|
||||||
# Revised drtype: max(Multimode)
|
# Revised drtype: max(Multimode)
|
||||||
#--------------------------------
|
#--------------------------------
|
||||||
gene_LF3['drtype_multimode'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode)
|
gene_LF3['drtype_multimode'] = gene_LF3.groupby('mutationinformation')['drtype_numeric'].agg(multimode)
|
||||||
gene_LF3['drtype_multimode']
|
gene_LF3['drtype_multimode']
|
||||||
|
|
||||||
# Now get the max from multimode
|
# Now get the max from multimode
|
||||||
|
@ -1394,26 +1339,150 @@ gene_LF3.head()
|
||||||
# Revised drtype: Max
|
# Revised drtype: Max
|
||||||
#----------------------
|
#----------------------
|
||||||
gene_LF3.head()
|
gene_LF3.head()
|
||||||
gene_LF3['drtype_max'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].max()
|
gene_LF3['drtype_max'] = gene_LF3.groupby('mutationinformation')['drtype_numeric'].max()
|
||||||
gene_LF3.head()
|
gene_LF3.head()
|
||||||
|
|
||||||
#%% Revised counts checks
|
foo = gene_LF3[['Mut', 'position', 'drtype', 'drtype_multimode', 'drtype_mode', 'drtype_max']]
|
||||||
gene_LF3['dst_mode'].value_counts()
|
foo2 = foo.sort_values(['position', 'Mut'])
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
#%% Mapping 2: column '<dst>', drug
|
||||||
|
|
||||||
|
#=======================
|
||||||
|
# column name: <drug>
|
||||||
|
#=======================
|
||||||
|
# mapping 1.1: labels
|
||||||
|
dm_om_label_map = {dr_muts_col: 'DM'
|
||||||
|
, other_muts_col: 'OM'}
|
||||||
|
dm_om_label_map
|
||||||
|
gene_LF3['mutation_info_labels'] = gene_LF3['mutation_info'].map(dm_om_label_map)
|
||||||
|
|
||||||
|
# mapping 1.2: numeric
|
||||||
|
dm_om_num_map = {dr_muts_col: 1
|
||||||
|
, other_muts_col: 0}
|
||||||
|
|
||||||
|
gene_LF3['dm_om_numeric'] = gene_LF3['mutation_info'].map(dm_om_num_map)
|
||||||
|
gene_LF3['dm_om_numeric_orig'] = gene_LF3['mutation_info_orig'].map(dm_om_num_map)
|
||||||
|
|
||||||
|
gene_LF3['mutation_info'].value_counts()
|
||||||
|
gene_LF3['mutation_info_labels'].value_counts()
|
||||||
|
gene_LF3['mutation_info_orig'].value_counts()
|
||||||
|
gene_LF3['dm_om_numeric'].value_counts()
|
||||||
|
gene_LF3['dm_om_numeric_orig'].value_counts()
|
||||||
|
|
||||||
|
# Check value_counts: column '<drug>', mutation_info
|
||||||
|
gene_LF3['mutation_info'].value_counts()
|
||||||
|
gene_LF3['mutation_info_v1'].value_counts()
|
||||||
|
gene_LF3['mutation_info_orig'].value_counts()
|
||||||
|
|
||||||
|
#============================
|
||||||
|
# column name: <dst>
|
||||||
|
#============================
|
||||||
|
# copy dst column
|
||||||
|
gene_LF3['dst'] = gene_LF3[drug] # to allow cross checking
|
||||||
|
gene_LF3['dst'].equals(gene_LF3[drug])
|
||||||
|
|
||||||
|
gene_LF3['dst_multimode'] = gene_LF3[drug]
|
||||||
|
|
||||||
|
gene_LF3[drug].isnull().sum()
|
||||||
|
gene_LF3['dst_multimode'].isnull().sum()
|
||||||
|
|
||||||
|
gene_LF3['dst_multimode'].value_counts()
|
||||||
|
gene_LF3['dst_multimode'].value_counts().sum()
|
||||||
|
#%% Multimode: dst
|
||||||
|
# For each mutation, generate the revised dst which is the mode of dm_om_numeric
|
||||||
|
#=============================
|
||||||
|
# Recalculation: Revised dst
|
||||||
|
# max(multimode)
|
||||||
|
#=============================
|
||||||
|
# Get multimode for dm_om_numeric column
|
||||||
|
#dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
|
||||||
|
dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
|
||||||
|
dm_om_multimode_LF3
|
||||||
|
dm_om_multimode_LF3.isnull().sum()
|
||||||
|
|
||||||
|
gene_LF3['dst_multimode_all'] = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
|
||||||
|
gene_LF3['dst_multimode_all']
|
||||||
|
|
||||||
|
# Fill using multimode ONLY where NA in dst_multimode column
|
||||||
|
gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3)
|
||||||
|
gene_LF3['dst_multimode']
|
||||||
|
|
||||||
|
#----------------------------------
|
||||||
|
# Revised dst column: max of mode
|
||||||
|
#----------------------------------
|
||||||
|
# Finally, create a revised dst with the max from the multimode
|
||||||
|
# Now get the max from multimode
|
||||||
|
#gene_LF3['dst_mode'] = gene_LF3.groupby('mutationinformation')['dst_noNA'].max() # this somehow is not right!
|
||||||
|
# Collapse each row's dst_multimode entry to a single value with np.nanmax.
# 'dst_noNA' is assigned (not left commented out) because the sanity checks
# and the summary print further down read gene_LF3['dst_noNA'].
gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
gene_LF3['dst_mode'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
|
||||||
|
|
||||||
|
# sanity checks
|
||||||
|
gene_LF3['dst_noNA'].equals(gene_LF3['dst_mode'])
|
||||||
|
|
||||||
gene_LF3[drug].value_counts()
|
gene_LF3[drug].value_counts()
|
||||||
|
gene_LF3['dst_noNA'].value_counts()
|
||||||
|
gene_LF3['dst_mode'].value_counts()
|
||||||
|
|
||||||
|
foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']]
|
||||||
|
foo2 = foo.sort_values(['position', 'Mut'])
|
||||||
|
|
||||||
print('\n------------------------------------------------------'
|
print('\n------------------------------------------------------'
|
||||||
, '\nRevised counting: mutation_info i.e dm om column'
|
, '\nRevised counting: mutation_info i.e dm om column\n'
|
||||||
, '\n-----------------------------------------------------'
|
, '\n-----------------------------------------------------\n'
|
||||||
|
|
||||||
, '\n----------------------------------'
|
, '\n----------------------------------'
|
||||||
, '\nOriginal drug column count'
|
, '\nOriginal drug column count'
|
||||||
, '\n----------------------------------'
|
, '\n----------------------------------\n'
|
||||||
, gene_LF3[drug].value_counts()
|
, gene_LF3[drug].value_counts()
|
||||||
, '\nTotal samples [original]:', gene_LF3[drug].value_counts().sum()
|
, '\nTotal samples [original]', gene_LF3[drug].value_counts().sum()
|
||||||
|
|
||||||
, '\n----------------------------------'
|
, '\n----------------------------------'
|
||||||
, '\nRevised drug column count'
|
, '\nRevised drug column count\n'
|
||||||
, '\n----------------------------------'
|
, '\n----------------------------------\n'
|
||||||
, gene_LF3['dst_mode'].value_counts()
|
, gene_LF3['dst_mode'].value_counts()
|
||||||
, '\nTotal samples [revised]:', gene_LF3['dst_mode'].value_counts().sum()
|
, '\nTotal samples [revised]', gene_LF3['dst_mode'].value_counts().sum()
|
||||||
|
|
||||||
|
, '\n----------------------------------'
|
||||||
|
, '\nRevised drug column count: dst_noNA\n'
|
||||||
|
, '\n----------------------------------\n'
|
||||||
|
, gene_LF3['dst_noNA'].value_counts()
|
||||||
)
|
)
|
||||||
|
#%% Create revised mutation_info_column based on dst_mode
|
||||||
|
#---------------------------------------
|
||||||
|
# Create revised mutation_info_column
|
||||||
|
#---------------------------------------
|
||||||
|
# Will need to overwrite column 'mutation_info_labels', since downstream depends on it
|
||||||
|
|
||||||
|
# Make a copy before overwriting
|
||||||
|
#gene_LF3['mutation_info_labels_v1'] = gene_LF3['mutation_info'].map(dm_om_label_map)
|
||||||
|
gene_LF3['mutation_info_labels_v1'] = gene_LF3['mutation_info_labels']
|
||||||
|
gene_LF3['mutation_info_labels_v1'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()
|
||||||
|
|
||||||
|
# Now overwrite
|
||||||
|
gene_LF3['mutation_info_labels_dst'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
|
||||||
|
gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
|
||||||
|
# Sanity-check the overwrite: both columns are built from the same
# dst_mode -> label mapping, so their count tables must match exactly.
# Series.equals() returns a single bool; comparing with `==` yields a boolean
# Series, whose truth value is ambiguous and makes the `if` raise ValueError.
if gene_LF3['mutation_info_labels_dst'].value_counts().equals(
        gene_LF3['mutation_info_labels'].value_counts()):
    print('\nRevised mutation_info colum created')
    # The temporary comparison column is not needed once the check passes
    gene_LF3 = gene_LF3.drop(['mutation_info_labels_dst'], axis = 1)
else:
    print('\nmutation info labels numbers mismatch'
          , '\nPlease check section for mapping dst_mode to labels')
|
||||||
|
|
||||||
|
gene_LF3['mutation_info_orig'].value_counts()
|
||||||
|
#gene_LF3['mutation_info_labels'].value_counts()
|
||||||
|
gene_LF3['mutation_info_labels_orig'] = gene_LF3['mutation_info_orig'].map(dm_om_label_map)
|
||||||
|
gene_LF3['mutation_info_labels_orig'].value_counts()
|
||||||
|
|
||||||
|
# %% sanity check for the revised dst
|
||||||
|
gene_LF3[drug].value_counts()
|
||||||
|
gene_LF3[drug].value_counts().sum()
|
||||||
|
gene_LF3['mutation_info_labels_orig'].value_counts()
|
||||||
|
|
||||||
|
gene_LF3['dst_mode'].value_counts()
|
||||||
|
gene_LF3['dst_mode'].value_counts().sum()
|
||||||
|
|
||||||
|
# direct comparison
|
||||||
|
gene_LF3['dst'].value_counts()
|
||||||
|
gene_LF3['mutation_info_labels'].value_counts()
|
||||||
|
#%%
|
Loading…
Add table
Add a link
Reference in a new issue