saved section for generating revised dst

This commit is contained in:
Tanushree Tunstall 2022-04-25 16:51:28 +01:00
parent cb93cef3c7
commit 0867827ec6

View file

@ -1238,10 +1238,12 @@ del(out_filename_metadata_poscounts)
#%% Add column: aa_calcprop
#%% NEW mappings: gene_LF2
# gene_LF2: copy gene_LF1
gene_LF2 = gene_LF1.copy()
gene_LF2 = gene_LF1.copy()gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list)
gene_LF2.index
#%% Add total unique id count
gene_LF2['id'].nunique()
gene_LF2['mutationinformation'].nunique()
gene_LF2['Mut'].nunique()
total_id_ucount = gene_LF2['id'].nunique()
total_id_ucount
total_id_ucount2 = gene_LF2['sample'].nunique()
@ -1275,37 +1277,23 @@ gene_LF2['total_id_ucount'] = total_id_ucount
gene_LF2['maf'] = gene_LF2['snp_frequency']/gene_LF2['total_id_ucount']
gene_LF2['maf'].head()
#%% Mapping 1: column '<drug>', mutation_info
gene_LF2['mutation_info'].value_counts()
gene_LF2['mutation_info_v1'].value_counts()
gene_LF2['mutation_info_orig'].value_counts()
#%% Further mappings: gene_LF3, with mutationinformation as INDEX
gene_LF3 = gene_LF2.copy()
#=======================
# column name: <drug>
#=======================
# mapping 1.1: labels
dm_om_label_map = {dr_muts_col: 'DM'
, other_muts_col: 'OM'}
dm_om_label_map
gene_LF2['mutation_info_labels'] = gene_LF2['mutation_info'].map(dm_om_label_map)
# Assign index: mutationinformation for mapping
gene_LF3 = gene_LF3.set_index(['mutationinformation'])
gene_LF3.index
gene_LF3['id'].nunique()
gene_LF3['Mut'].nunique()
gene_LF3.index.nunique()
# mapping 1.2: numeric
dm_om_num_map = {dr_muts_col: 1
, other_muts_col: 0}
all(gene_LF3['Mut'] == gene_LF3.index)
gene_LF2['dm_om_numeric'] = gene_LF2['mutation_info'].map(dm_om_num_map)
gene_LF2['dm_om_numeric_orig'] = gene_LF2['mutation_info_orig'].map(dm_om_num_map)
gene_LF2['mutation_info'].value_counts()
gene_LF2['mutation_info_labels'].value_counts()
gene_LF2['mutation_info_orig'].value_counts()
gene_LF2['dm_om_numeric'].value_counts()
gene_LF2['dm_om_numeric_orig'].value_counts()
#%% Mapping 2: column '<drtype>'
#%% Mapping 1: column '<drtype>'
#============================
# column name: <drtype>
#============================
gene_LF2['drtype'].value_counts()
gene_LF3['drtype'].value_counts()
# mapping 2.1: numeric
drtype_map = {'XDR': 5
@ -1315,55 +1303,10 @@ drtype_map = {'XDR': 5
, 'Other': 1
, 'Sensitive': 0}
gene_LF2['drtype_numeric'] = gene_LF2['drtype'].map(drtype_map)
gene_LF3['drtype_numeric'] = gene_LF3['drtype'].map(drtype_map)
gene_LF2['drtype'].value_counts()
gene_LF2['drtype_numeric'].value_counts()
#%% Mapping 3: column '<dst>', drug
#============================
# column name: <dst>
#============================
# copy dst column
gene_LF2['dst'] = gene_LF2[drug] # to allow cross checking
gene_LF2['dst'].equals(gene_LF2[drug])
gene_LF2['dst_multimode'] = gene_LF2[drug]
gene_LF2[drug].isnull().sum()
gene_LF2['dst_multimode'].isnull().sum()
#%% Further mappings: gene_LF3
gene_LF3 = gene_LF2.copy()
gene_LF3.index
gene_LF3 = gene_LF3.set_index(['Mut'])
gene_LF3.index
gene_LF3['dst_multimode'].value_counts()
gene_LF3['dst_multimode'].value_counts().sum()
#%% Multimode: dst
# For each mutation, generate the revised dst which is the mode of dm_om_numeric
#=============================
# Recalculation: Revised dst
# max(multimode)
#=============================
# Get multimode for dm_om_numeric column
#dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
dm_om_multimode_LF3
# Fill using multimode ONLY where NA in dst_multimode column
gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3)
# Now get the max from multimode
gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
print(gene_LF3)
#%% Revised Columns:IMPORTANT
#%% Multimode: dst column
#----------------------------
# Revised dst column: Max
#----------------------------
# Finally created a revised dst with the max from the multimode
gene_LF3['dst_mode'] = gene_LF3.groupby('mutationinformation')['dst_noNA'].max()
gene_LF3['drtype'].value_counts()
gene_LF3['drtype_numeric'].value_counts()
#%% Multimode: drtype
#=============================
@ -1378,12 +1321,14 @@ gene_LF3['drtype_all_vals'] = gene_LF3['drtype_numeric']
gene_LF3['drtype_all_names'] = gene_LF3['drtype']
gene_LF3['drtype_all_vals'] = gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list)
gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list)
gene_LF3['drtype_all_vals'].head()
gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list)
gene_LF3['drtype_all_names'].head()
#---------------------------------
# Revised drtype: max(Multimode)
#--------------------------------
gene_LF3['drtype_multimode'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode)
gene_LF3['drtype_multimode'] = gene_LF3.groupby('mutationinformation')['drtype_numeric'].agg(multimode)
gene_LF3['drtype_multimode']
# Now get the max from multimode
@ -1394,26 +1339,150 @@ gene_LF3.head()
# Revised drtype: Max
#----------------------
gene_LF3.head()
gene_LF3['drtype_max'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].max()
gene_LF3['drtype_max'] = gene_LF3.groupby('mutationinformation')['drtype_numeric'].max()
gene_LF3.head()
#%% Revised counts checks
gene_LF3['dst_mode'].value_counts()
foo = gene_LF3[['Mut', 'position', 'drtype', 'drtype_multimode', 'drtype_mode', 'drtype_max']]
foo2 = foo.sort_values(['position', 'Mut'])
###############################################################################
#%% Mapping 2: column '<dst>', drug
#=======================
# column name: <drug>
#=======================
# mapping 1.1: labels
dm_om_label_map = {dr_muts_col: 'DM'
, other_muts_col: 'OM'}
dm_om_label_map
gene_LF3['mutation_info_labels'] = gene_LF3['mutation_info'].map(dm_om_label_map)
# mapping 1.2: numeric
dm_om_num_map = {dr_muts_col: 1
, other_muts_col: 0}
gene_LF3['dm_om_numeric'] = gene_LF3['mutation_info'].map(dm_om_num_map)
gene_LF3['dm_om_numeric_orig'] = gene_LF3['mutation_info_orig'].map(dm_om_num_map)
gene_LF3['mutation_info'].value_counts()
gene_LF3['mutation_info_labels'].value_counts()
gene_LF3['mutation_info_orig'].value_counts()
gene_LF3['dm_om_numeric'].value_counts()
gene_LF3['dm_om_numeric_orig'].value_counts()
# Check value_counts: column '<drug>', mutation_info
gene_LF3['mutation_info'].value_counts()
gene_LF3['mutation_info_v1'].value_counts()
gene_LF3['mutation_info_orig'].value_counts()
#============================
# column name: <dst>
#============================
# copy dst column
gene_LF3['dst'] = gene_LF3[drug] # to allow cross checking
gene_LF3['dst'].equals(gene_LF3[drug])
gene_LF3['dst_multimode'] = gene_LF3[drug]
gene_LF3[drug].isnull().sum()
gene_LF3['dst_multimode'].isnull().sum()
gene_LF3['dst_multimode'].value_counts()
gene_LF3['dst_multimode'].value_counts().sum()
#%% Multimode: dst
# For each mutation, generate the revised dst which is the mode of dm_om_numeric
#=============================
# Recalculation: Revised dst
# max(multimode)
#=============================
# Get multimode for dm_om_numeric column, one entry per 'mutationinformation' group.
# NOTE(review): `multimode` is not defined in this section — presumably
# statistics.multimode (returns a list of the most common values); confirm the import.
#dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
dm_om_multimode_LF3
dm_om_multimode_LF3.isnull().sum()
# Keep the per-mutation multimode as its own column for cross checking
gene_LF3['dst_multimode_all'] = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
gene_LF3['dst_multimode_all']
# Fill using multimode ONLY where NA in dst_multimode column
# (non-NA cells keep the value copied from the drug column)
gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3)
gene_LF3['dst_multimode']
#----------------------------------
# Revised dst column: max of mode
#----------------------------------
# Finally, create a revised dst with the max from the multimode
# Now get the max from multimode: np.nanmax reduces each cell to a single value
#gene_LF3['dst_mode'] = gene_LF3.groupby('mutationinformation')['dst_noNA'].max() # this somehow is not right!
#gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
gene_LF3['dst_mode'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
# sanity checks
# NOTE(review): within this section 'dst_noNA' is only assigned in the
# commented-out line above — confirm the column is still created elsewhere,
# otherwise the 'dst_noNA' lookups below raise KeyError.
gene_LF3['dst_noNA'].equals(gene_LF3['dst_mode'])
gene_LF3[drug].value_counts()
gene_LF3['dst_noNA'].value_counts()
gene_LF3['dst_mode'].value_counts()
# Side-by-side view of original vs revised dst columns, ordered by position
foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']]
foo2 = foo.sort_values(['position', 'Mut'])
print('\n------------------------------------------------------'
, '\nRevised counting: mutation_info i.e dm om column'
, '\n-----------------------------------------------------'
, '\nRevised counting: mutation_info i.e dm om column\n'
, '\n-----------------------------------------------------\n'
, '\n----------------------------------'
, '\nOriginal drug column count'
, '\n----------------------------------'
, '\n----------------------------------\n'
, gene_LF3[drug].value_counts()
, '\nTotal samples [original]:', gene_LF3[drug].value_counts().sum()
, '\nTotal samples [original]', gene_LF3[drug].value_counts().sum()
, '\n----------------------------------'
, '\nRevised drug column count'
, '\n----------------------------------'
, '\nRevised drug column count\n'
, '\n----------------------------------\n'
, gene_LF3['dst_mode'].value_counts()
, '\nTotal samples [revised]:', gene_LF3['dst_mode'].value_counts().sum()
, '\nTotal samples [revised]', gene_LF3['dst_mode'].value_counts().sum()
, '\n----------------------------------'
, '\nRevised drug column count: dst_noNA\n'
, '\n----------------------------------\n'
, gene_LF3['dst_noNA'].value_counts()
)
#%% Create revised mutation_info_column based on dst_mode
#---------------------------------------
# Create revised mutation_info_column
#---------------------------------------
# Will need to overwrite column 'mutation_info_labels', since downstream depends on it
# Make a copy before overwriting
#gene_LF3['mutation_info_labels_v1'] = gene_LF3['mutation_info'].map(dm_om_label_map)
gene_LF3['mutation_info_labels_v1'] = gene_LF3['mutation_info_labels']
gene_LF3['mutation_info_labels_v1'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()
# Now overwrite
# Map the revised dst (dst_mode: 1/0) to DM/OM labels. The same mapping is
# written to a temporary '_dst' column and to 'mutation_info_labels' so the
# two can be cross-checked before the temporary column is dropped.
gene_LF3['mutation_info_labels_dst'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
# Compare the two count tables with Series.equals, which returns a single bool.
# `if (a.value_counts() == b.value_counts())` yields an element-wise boolean
# Series whose truth value is ambiguous and raises ValueError.
if gene_LF3['mutation_info_labels_dst'].value_counts().equals(gene_LF3['mutation_info_labels'].value_counts()):
    print('\nRevised mutation_info colum created')
    gene_LF3 = gene_LF3.drop(['mutation_info_labels_dst'], axis = 1)
else:
    print('\nmutation info labels numbers mismatch'
          , '\nPlease check section for mapping dst_mode to labels')
gene_LF3['mutation_info_orig'].value_counts()
#gene_LF3['mutation_info_labels'].value_counts()
gene_LF3['mutation_info_labels_orig'] = gene_LF3['mutation_info_orig'].map(dm_om_label_map)
gene_LF3['mutation_info_labels_orig'].value_counts()
# %% sanity check for the revised dst
gene_LF3[drug].value_counts()
gene_LF3[drug].value_counts().sum()
gene_LF3['mutation_info_labels_orig'].value_counts()
gene_LF3['dst_mode'].value_counts()
gene_LF3['dst_mode'].value_counts().sum()
# direct comparison
gene_LF3['dst'].value_counts()
gene_LF3['mutation_info_labels'].value_counts()
#%%