From 0867827ec69d4a8deff968c9760999fd32c41b10 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 25 Apr 2022 16:51:28 +0100 Subject: [PATCH] saved section for generating revised dst --- scripts/data_extraction.py | 243 ++++++++++++++++++++++++------------- 1 file changed, 156 insertions(+), 87 deletions(-) diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index fb9449d..3d26352 100644 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -1238,10 +1238,12 @@ del(out_filename_metadata_poscounts) #%% Add column: aa_calcprop #%% NEW mappings: gene_LF2 # gene_LF2: copy gene_LF1 -gene_LF2 = gene_LF1.copy() +gene_LF2 = gene_LF1.copy()gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list) +gene_LF2.index + #%% Add total unique id count gene_LF2['id'].nunique() -gene_LF2['mutationinformation'].nunique() +gene_LF2['Mut'].nunique() total_id_ucount = gene_LF2['id'].nunique() total_id_ucount total_id_ucount2 = gene_LF2['sample'].nunique() @@ -1275,37 +1277,23 @@ gene_LF2['total_id_ucount'] = total_id_ucount gene_LF2['maf'] = gene_LF2['snp_frequency']/gene_LF2['total_id_ucount'] gene_LF2['maf'].head() -#%% Mapping 1: column '', mutation_info -gene_LF2['mutation_info'].value_counts() -gene_LF2['mutation_info_v1'].value_counts() -gene_LF2['mutation_info_orig'].value_counts() +#%% Further mappings: gene_LF3, with mutationinformation as INDEX +gene_LF3 = gene_LF2.copy() -#======================= -# column name: -#======================= -# mapping 1.1: labels -dm_om_label_map = {dr_muts_col: 'DM' - , other_muts_col: 'OM'} -dm_om_label_map -gene_LF2['mutation_info_labels'] = gene_LF2['mutation_info'].map(dm_om_label_map) +# Assign index: mutationinformation for mapping +gene_LF3 = gene_LF3.set_index(['mutationinformation']) +gene_LF3.index +gene_LF3['id'].nunique() +gene_LF3['Mut'].nunique() +gene_LF3.index.nunique() -# mapping 1.2: numeric -dm_om_num_map = {dr_muts_col: 1 - , other_muts_col: 0} +all(gene_LF3['Mut'] == gene_LF3.index) -gene_LF2['dm_om_numeric'] = gene_LF2['mutation_info'].map(dm_om_num_map) -gene_LF2['dm_om_numeric_orig'] = gene_LF2['mutation_info_orig'].map(dm_om_num_map) - -gene_LF2['mutation_info'].value_counts() -gene_LF2['mutation_info_labels'].value_counts() -gene_LF2['mutation_info_orig'].value_counts() -gene_LF2['dm_om_numeric'].value_counts() -gene_LF2['dm_om_numeric_orig'].value_counts() -#%% Mapping 2: column '' +#%% Mapping 1: column '' #============================ # column name: #============================ -gene_LF2['drtype'].value_counts() +gene_LF3['drtype'].value_counts() # mapping 2.1: numeric drtype_map = {'XDR': 5 @@ -1315,55 +1303,10 @@ drtype_map = {'XDR': 5 , 'Other': 1 , 'Sensitive': 0} -gene_LF2['drtype_numeric'] = gene_LF2['drtype'].map(drtype_map) +gene_LF3['drtype_numeric'] = gene_LF3['drtype'].map(drtype_map) -gene_LF2['drtype'].value_counts() -gene_LF2['drtype_numeric'].value_counts() -#%% Mapping 3: column '', drug -#============================ -# column name: -#============================ -# copy dst column -gene_LF2['dst'] = gene_LF2[drug] # to allow cross checking -gene_LF2['dst'].equals(gene_LF2[drug]) - -gene_LF2['dst_multimode'] = gene_LF2[drug] - -gene_LF2[drug].isnull().sum() -gene_LF2['dst_multimode'].isnull().sum() -#%% Further mappings: gene_LF3 -gene_LF3 = gene_LF2.copy() -gene_LF3.index -gene_LF3 = gene_LF3.set_index(['Mut']) -gene_LF3.index - -gene_LF3['dst_multimode'].value_counts() -gene_LF3['dst_multimode'].value_counts().sum() -#%% Multimode: dst -# For each mutation, generate the revised dst which is the mode of dm_om_numeric -#============================= -# Recalculation: Revised dst -# max(multimode) -#============================= -# Get multimode for dm_om_numeric column -#dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode) -dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode) - -dm_om_multimode_LF3 - -# Fill using multimode ONLY where NA in dst_multimode column -gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3) - -# Now get the max from multimode -gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x)) -print(gene_LF3) -#%% Revised Columns:IMPORTANT -#%% Multimode: dst column -#---------------------------- -# Revised dst column: Max -#---------------------------- -# Finally created a revised dst with the max from the multimode -gene_LF3['dst_mode'] = gene_LF3.groupby('mutationinformation')['dst_noNA'].max() +gene_LF3['drtype'].value_counts() +gene_LF3['drtype_numeric'].value_counts() #%% Multimode: drtype #============================= @@ -1378,12 +1321,14 @@ gene_LF3['drtype_all_vals'] = gene_LF3['drtype_numeric'] gene_LF3['drtype_all_names'] = gene_LF3['drtype'] gene_LF3['drtype_all_vals'] = gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list) -gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list) +gene_LF3['drtype_all_vals'].head() +gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list) +gene_LF3['drtype_all_names'].head() #--------------------------------- # Revised drtype: max(Multimode) #-------------------------------- -gene_LF3['drtype_multimode'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode) +gene_LF3['drtype_multimode'] = gene_LF3.groupby('mutationinformation')['drtype_numeric'].agg(multimode) gene_LF3['drtype_multimode'] # Now get the max from multimode @@ -1394,26 +1339,150 @@ gene_LF3.head() # Revised drtype: Max #---------------------- gene_LF3.head() -gene_LF3['drtype_max'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].max() +gene_LF3['drtype_max'] = gene_LF3.groupby('mutationinformation')['drtype_numeric'].max() gene_LF3.head() -#%% Revised counts checks -gene_LF3['dst_mode'].value_counts() +foo = gene_LF3[['Mut', 'position', 'drtype', 'drtype_multimode', 'drtype_mode', 'drtype_max']] +foo2 = foo.sort_values(['position', 'Mut']) + +############################################################################### +#%% Mapping 2: column '', drug + +#======================= +# column name: +#======================= +# mapping 1.1: labels +dm_om_label_map = {dr_muts_col: 'DM' + , other_muts_col: 'OM'} +dm_om_label_map +gene_LF3['mutation_info_labels'] = gene_LF3['mutation_info'].map(dm_om_label_map) + +# mapping 1.2: numeric +dm_om_num_map = {dr_muts_col: 1 + , other_muts_col: 0} + +gene_LF3['dm_om_numeric'] = gene_LF3['mutation_info'].map(dm_om_num_map) +gene_LF3['dm_om_numeric_orig'] = gene_LF3['mutation_info_orig'].map(dm_om_num_map) + +gene_LF3['mutation_info'].value_counts() +gene_LF3['mutation_info_labels'].value_counts() +gene_LF3['mutation_info_orig'].value_counts() +gene_LF3['dm_om_numeric'].value_counts() +gene_LF3['dm_om_numeric_orig'].value_counts() + +# Check value_counts: column '', mutation_info +gene_LF3['mutation_info'].value_counts() +gene_LF3['mutation_info_v1'].value_counts() +gene_LF3['mutation_info_orig'].value_counts() + +#============================ +# column name: +#============================ +# copy dst column +gene_LF3['dst'] = gene_LF3[drug] # to allow cross checking +gene_LF3['dst'].equals(gene_LF3[drug]) + +gene_LF3['dst_multimode'] = gene_LF3[drug] + +gene_LF3[drug].isnull().sum() +gene_LF3['dst_multimode'].isnull().sum() + +gene_LF3['dst_multimode'].value_counts() +gene_LF3['dst_multimode'].value_counts().sum() +#%% Multimode: dst +# For each mutation, generate the revised dst which is the mode of dm_om_numeric +#============================= +# Recalculation: Revised dst +# max(multimode) +#============================= +# Get multimode for dm_om_numeric column +#dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode) +dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode) +dm_om_multimode_LF3 +dm_om_multimode_LF3.isnull().sum() + +gene_LF3['dst_multimode_all'] = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode) +gene_LF3['dst_multimode_all'] + +# Fill using multimode ONLY where NA in dst_multimode column +gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3) +gene_LF3['dst_multimode'] + +#---------------------------------- +# Revised dst column: max of mode +#---------------------------------- +# Finally created a revised dst with the max from the multimode +# Now get the max from multimode +#gene_LF3['dst_mode'] = gene_LF3.groupby('mutationinformation')['dst_noNA'].max() # this somehow is not right! +#gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x)) +gene_LF3['dst_mode'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x)) + +# sanity checks +gene_LF3['dst_noNA'].equals(gene_LF3['dst_mode']) + gene_LF3[drug].value_counts() +gene_LF3['dst_noNA'].value_counts() +gene_LF3['dst_mode'].value_counts() + +foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']] +foo2 = foo.sort_values(['position', 'Mut']) print('\n------------------------------------------------------' - , '\nRevised counting: mutation_info i.e dm om column' - , '\n-----------------------------------------------------' + , '\nRevised counting: mutation_info i.e dm om column\n' + , '\n-----------------------------------------------------\n' , '\n----------------------------------' , '\nOriginal drug column count' - , '\n----------------------------------' + , '\n----------------------------------\n' , gene_LF3[drug].value_counts() - , '\nTotal samples [original]:', gene_LF3[drug].value_counts().sum() + , '\nTotal samples [original]', gene_LF3[drug].value_counts().sum() , '\n----------------------------------' - , '\nRevised drug column count' - , '\n----------------------------------' + , '\nRevised drug column count\n' + , '\n----------------------------------\n' , gene_LF3['dst_mode'].value_counts() - , '\nTotal samples [revised]:', gene_LF3['dst_mode'].value_counts().sum() + , '\nTotal samples [revised]', gene_LF3['dst_mode'].value_counts().sum() + + , '\n----------------------------------' + , '\nRevised drug column count: dst_noNA\n' + , '\n----------------------------------\n' + , gene_LF3['dst_noNA'].value_counts() ) +#%% Create revised mutation_info_column based on dst_mode +#--------------------------------------- +# Create revised mutation_info_column +#--------------------------------------- +# Will need to overwrite column 'mutation_info_labels', since downstream depends on it + +# Make a copy you before overwriting +#gene_LF3['mutation_info_labels_v1'] = gene_LF3['mutation_info'].map(dm_om_label_map) +gene_LF3['mutation_info_labels_v1'] = gene_LF3['mutation_info_labels'] +gene_LF3['mutation_info_labels_v1'].value_counts() == gene_LF3['mutation_info_labels'].value_counts() + +# Now overwrite +gene_LF3['mutation_info_labels_dst'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'}) +gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'}) +if (gene_LF3['mutation_info_labels_dst'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()): + print('\nRevised mutation_info colum created') + gene_LF3 = gene_LF3.drop(['mutation_info_labels_dst'], axis = 1) +else: + print('\nmutation info labels numbers mismatch' + , '\nPlease check section for mapping dst_mode to labels') + +gene_LF3['mutation_info_orig'].value_counts() +#gene_LF3['mutation_info_labels'].value_counts() +gene_LF3['mutation_info_labels_orig'] = gene_LF3['mutation_info_orig'].map(dm_om_label_map) +gene_LF3['mutation_info_labels_orig'].value_counts() + +# %% sanity check for the revised dst +gene_LF3[drug].value_counts() +gene_LF3[drug].value_counts().sum() +gene_LF3['mutation_info_labels_orig'].value_counts() + +gene_LF3['dst_mode'].value_counts() +gene_LF3['dst_mode'].value_counts().sum() + +# direct comparision +gene_LF3['dst'].value_counts() +gene_LF3['mutation_info_labels'].value_counts() +#%% \ No newline at end of file