From ae3a5500c93c5c660e31c8f9cb1eec724a8fefe9 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 14 Apr 2022 19:27:21 +0100 Subject: [PATCH] mostly done, now adding lineage magicry --- scripts/data_extraction_v2.py | 344 +++++++++++++++++++++++++++++----- 1 file changed, 302 insertions(+), 42 deletions(-) diff --git a/scripts/data_extraction_v2.py b/scripts/data_extraction_v2.py index e3e404f..70aeae3 100644 --- a/scripts/data_extraction_v2.py +++ b/scripts/data_extraction_v2.py @@ -1440,79 +1440,339 @@ else: # clear variables del(k, v, wt, mut, lookup_dict) +#%% NEW mappings: gene_LF2 +# gene_LF2: copy gene_LF1 +gene_LF2 = gene_LF1.copy() -#%% NEW TODO: map mutationinformation +#%% AF for gene +gene_LF2['id'].nunique() +gene_LF2['mutationinformation'].nunique() +total_id_ucount = gene_LF2['id'].nunique() +total_id_ucount +total_id_ucount2 = gene_LF2['sample'].nunique() +total_id_ucount2 -#%% NEW: mappings +if total_id_ucount == total_id_ucount2: + print('\nPASS: sample and id unique counts match') +else: + print('\nFAIL: sample and id unique counts DO NOT match!' + , '\nGWAS worry!?') + +# Add all sample ids in a list for sanity checks +#gene_LF2['id_list'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].apply(list)) + +#======================================= +# Add column: total unique id/sample count +#======================================= +gene_LF2['total_id_ucount'] = total_id_ucount + +#========================================== +# Add column: mutation count in all samples +#========================================== +gene_LF2['mut_id_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique()) +gene_LF2['mut_id_ucount'] + +#================= +# Add column: AF +#================= +gene_LF2['maf'] = gene_LF2['mut_id_ucount']/gene_LF2['total_id_ucount'] +gene_LF2['maf'].head() + +#%% Mapping 1: column '', mutation_info #======================= # column name: #======================= -# mapping 1: labels +# mapping 1.1: labels dm_om_label_map = {dr_muts_col: 'DM' , other_muts_col: 'OM'} +dm_om_label_map +gene_LF2['mutation_info_labels'] = gene_LF2['mutation_info'].map(dm_om_label_map) -gene_LF0['mutation_info_labels'] = gene_LF0['mutation_info'].map(dm_om_label_map) - -# mapping 2: numeric +# mapping 1.2: numeric dm_om_num_map = {dr_muts_col: 1 , other_muts_col: 0} -gene_LF0['dm_om_numeric'] = gene_LF0['mutation_info'].map(dm_om_num_map) +gene_LF2['dm_om_numeric'] = gene_LF2['mutation_info'].map(dm_om_num_map) +gene_LF2['dm_om_numeric_orig'] = gene_LF2['mutation_info_orig'].map(dm_om_num_map) -gene_LF0['mutation_info'].value_counts() -gene_LF0['mutation_info_labels'].value_counts() -gene_LF0['dm_om_numeric'].value_counts() +gene_LF2['mutation_info'].value_counts() +gene_LF2['mutation_info_labels'].value_counts() +gene_LF2['dm_om_numeric'].value_counts() +gene_LF2['dm_om_numeric_orig'].value_counts() +#%% Mapping 2: column '', mutation #============================ # column name: #============================ -gene_LF0['drtype'].value_counts() +gene_LF2['drtype'].value_counts() -# mapping: numeric +# mapping 2.1: numeric drtype_map = {'XDR': 5 , 'Pre-XDR': 4 , 'MDR': 3 , 'Pre-MDR': 2 , 'Other': 1 , 'Sensitive': 0} -gene_LF0['drtype_numeric'] = gene_LF0['drtype'].map(drtype_map) -gene_LF0['drtype'].value_counts() -gene_LF0['drtype_numeric'].value_counts() +gene_LF2['drtype_numeric'] = gene_LF2['drtype'].map(drtype_map) -#%% multimode -# COPY dst column -gene_LF0['dst'] = gene_LF0[drug] # to allow cross checking -gene_LF0['dst_multimode'] = gene_LF0[drug] +gene_LF2['drtype'].value_counts() +gene_LF2['drtype_numeric'].value_counts() + +#%% Recalculations: Multimode +# copy dst column +gene_LF2['dst'] = gene_LF2[drug] # to allow cross checking +gene_LF2['dst'].equals(gene_LF2[drug]) + +gene_LF2['dst_multimode'] = gene_LF2[drug] # sanity check -gene_LF0[drug].value_counts() -gene_LF0['dst_multimode'].value_counts() -gene_LF0[drug].isna().sum() +gene_LF2[drug].value_counts() +gene_LF2['dst_multimode'].value_counts() -if gene_LF0[drug].value_counts().sum()+gene_LF0[drug].isna().sum() == len(gene_LF0): - print('\nPASS:', 'Numbers match') -else: - print('\nFAIL:', 'Numbers mismatch') +gene_LF2[drug].isnull().sum() +gene_LF2['dst_multimode'].isnull().sum() + +gene_LF2['mutationinformation'].value_counts() +gene_LF2[drug].isnull().groupby(gene_LF2['mutationinformation']).sum() -gene_LF0['mutation'].value_counts() -#data.C.isnull().groupby([df['A'],df['B']]).sum().astype(int).reset_index(name='count') -gene_LF0[drug].isnull().groupby(gene_LF0['mutation']).sum() - # GOAL is to populate na in the dst column from the count of the dm_om_numeric column -gene_LF0['dst_multimode'].isnull().groupby(gene_LF0['mutationinformation']).sum() - -# COPY mutationinformation for sanity check -#data['mutation'] = data['mutationinformation'] -gene_LF0['mutationinformation'] = gene_LF0['mutation'] - +gene_LF2['dst_multimode'].isnull().groupby(gene_LF2['mutationinformation']).sum() +gene_LF2.index +gene_LF2['index_orig'] = gene_LF2.index # need it for setting back later +#%% FIXME: Add sanity check +#gene_LF2['index_orig'].equals(gene_LF2.index) +#%% Set index: 'mutationinformation' for adding multimode +gene_LF3 = gene_LF2.set_index(['mutationinformation']) +gene_LF3.index +gene_LF3['dst_multimode'].value_counts() +gene_LF3['dst_multimode'].value_counts().sum() +#%% Multimode: dst +# For each mutation, generate the revised dst which is the mode of dm_om_numeric +#============================= +# Recalculation: Revised dst +# max(multimode) +#============================= # Get multimode for dm_om_numeric column -dm_om_multimode = gene_LF0.groupby('mutationinformation')['dm_om_numeric'].agg(multimode) -dm_om_multimode +dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode) +dm_om_multimode_LF3 -gene_LF0['dst_multimode'] = gene_LF0['dst_multimode'].fillna(dm_om_multimode) -gene_LF0['dst_noNA'] = gene_LF0['dst_multimode'].apply(lambda x: np.nanmax(x)) -print(gene_LF0) +# Fill using multimode ONLY where NA in dst_multimode column +gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3) +# Now get the max from multimode +gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x)) +print(gene_LF3) + +#---------------------------- +# Revised dst column: Max +#---------------------------- # Finally created a revised dst with the max from the multimode -gene_LF0['dst_mode'] = gene_LF0.groupby('mutationinformation')['dst_noNA'].max() +gene_LF3['dst_mode'] = gene_LF3.groupby('mutationinformation')['dst_noNA'].max() + +#%% Multimode: drtype +#============================= +# Recalculation: Revised drtype +# max(multimode) +#============================= +#-------------------------------- +# drtype: ALL values: +# numeric and names in an array +#-------------------------------- +gene_LF3['drtype_all_vals'] = gene_LF3['drtype_numeric'] +gene_LF3['drtype_all_names'] = gene_LF3['drtype'] + +gene_LF3['drtype_all_vals'] = gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list) +gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list) + +#--------------------------------- +# Revised drtype: max(Multimode) +#-------------------------------- +gene_LF3['drtype_multimode'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode) +gene_LF3['drtype_multimode'] + +# Now get the max from multimode +gene_LF3['drtype_mode'] = gene_LF3['drtype_multimode'].apply(lambda x: np.nanmax(x)) +gene_LF3.head() + +#---------------------- +# Revised drtype: Max +#---------------------- +gene_LF3.head() +gene_LF3['drtype_max'] = gene_LF3.groupby(['mutationinformation'])['drtype_numeric'].max() +gene_LF3.head() + +#%% Reset index: original indices +#gene_LF3 = gene_LF3.reset_index() +gene_LF3.index +gene_LF3['mutationinformation'] = gene_LF3.index +gene_LF3 = gene_LF3.set_index(['index_orig']) + +gene_LF3[['mutationinformation']] +gene_LF3.index +#%% Revised counts +gene_LF3['dst_mode'].value_counts() +gene_LF3[drug].value_counts() + +print('\n------------------------------------------------------' + , '\nRevised counting: mutation_info i.e dm om column' + , '\n-----------------------------------------------------' + + , '\n----------------------------------' + , '\nOriginal drug column count' + , '\n----------------------------------' + , gene_LF3[drug].value_counts() + , '\nTotal samples [original]:', gene_LF3[drug].value_counts().sum() + + , '\n----------------------------------' + , '\nRevised drug column count' + , '\n----------------------------------' + , gene_LF3['dst_mode'].value_counts() + , '\nTotal samples [revised]:', gene_LF3['dst_mode'].value_counts().sum() + ) +#%% FIXME: CHECK THIS, run this with mutation_info_REV +#--------------------------------------- +# Create revised mutation_info_column +#--------------------------------------- +# Note this is overriding, since downstream depends on it +# make a copy you if you need to keep that +# create a copy before running this + +gene_LF3['mutation_info_labels_orig'] = gene_LF3['mutation_info_labels'] + +gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM' + , 0: 'OM'}) + +gene_LF3['mutation_info_labels_orig'].value_counts() +gene_LF3['mutation_info_labels_orig'].value_counts().sum() + +gene_LF3['mutation_info_labels'].value_counts() +gene_LF3['mutation_info_labels'].value_counts().sum() +#%% TEST muts +test_muts = ['G132A', 'V180F', 'G108R', 'A102P'] +test_muts = ['L4S', 'L4W', 'A102P'] + +gene_LF3 = gene_LF2[gene_LF2.loc[:,'mutationinformation'].isin(test_muts)] +gene_LF4 = gene_LF3[['id', 'mutationinformation', drug + , 'mutation_info_orig', 'dm_om_numeric_orig', 'dst', 'dst_multimode']] + +# Reset index as it allows the groupby expression to directly map +gene_LF4.index +gene_LF4= gene_LF4.set_index(['mutationinformation']) +gene_LF4.index + +# Get multimode for dm_om_numeric column +dm_om_multimode_LF4 = gene_LF4.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode) +dm_om_multimode_LF4 + +gene_LF4['dst_multimode'] = gene_LF4['dst_multimode'].fillna(dm_om_multimode_LF4) + +# LINEAGE +foo = gene_LF2.copy() +foo = foo[foo.loc[:,'mutationinformation'].isin(test_muts)] +foo = foo[['id', 'mutationinformation','lineage' ]] +foo['MUT'] = foo['mutationinformation'] +foo['lineage'] = foo['lineage'].str.strip() +foo['lineage_corrupt'] = foo['lineage'] + +#foo['lineage_ucount'] = foo['mutationinformation'].map(foo.groupby('mutationinformation')['lineage'].nunique())# seems wrong! + +foo2 = tidy_split(foo, 'lineage_corrupt', sep = ';') +foo2['lineage_corrupt'] = foo2['lineage_corrupt'].str.strip() +foo2['lineage_corrupt_list'] = foo2['mutationinformation'].map(foo2.groupby('mutationinformation')['lineage_corrupt'].apply(list)) +foo2['lineage_corrupt_ucount'] = foo2['mutationinformation'].map(foo2.groupby('mutationinformation')['lineage_corrupt'].nunique()) + +foo2.groupby('mutationinformation')['lineage_corrupt'].value_counts() +foo2['lineage_corrupt'].value_counts() +foo2['lineage_corrupt_ucount'] +foo2.index +foo2 = foo2.set_index(['mutationinformation']) + + +# now merge +foo.index +foo.index.nunique() +foo2.index.nunique() +foo_copy = foo.copy() + +foo_copy['lineage_ucount'] = foo_copy['lineage'] +foo_copy.loc[foo2.index, 'lineage_ucount'] = foo2['lineage_corrupt_ucount'] + +#%%FIXME: do regex for lineage for meta data else the ; messes it up +#-------------------------- +# lineage multimode mode +#-------------------------- +lineage_label_map = {'lineage1' : 'L1' + , 'lineage2' : 'L2' + , 'lineage3' : 'L3' + , 'lineage4' : 'L4' + , 'lineage5' : 'L5' + , 'lineage6' : 'L6' + , 'lineage7' : 'L7' + , 'lineageBOV' : 'LBOV'} + +foo['lineage'].value_counts() + +lineage_label_numeric = {'lineage1' : 1 + , 'lineage2' : 2 + , 'lineage3' : 3 + , 'lineage4' : 4 + , 'lineage5' : 5 + , 'lineage6' : 6 + , 'lineage7' : 7 + , 'lineageBOV' : 8} + + +lineage_label_numeric +foo2['lineage_corrupt'].value_counts() +foo2['lineage_numeric'] = foo2['lineage_corrupt'].map(lineage_label_numeric) +foo2['lineage_numeric'].value_counts() + +foo2['lineage_numeric_list'] = foo2['mutationinformation'].map(foo2.groupby('mutationinformation')['lineage_numeric'].apply(list)) +foo2['lineage_numeric_list'] + +foo2['lineage_multimode'] = foo2.groupby(['mutationinformation'])['lineage_numeric'].agg(multimode) +c2 = foo2[foo2.loc[:, 'MUT'].isin(['A102P'])] +c2['lineage_numeric'].value_counts() + + +#%% Lineage counts (including the ones containing multiple entries) + +# Get information about how many distinct lineages each mutation comes from +gene_LF3['lineage'].value_counts() +gene_LF3['lineage'] = gene_LF3['lineage'].str.strip() +gene_LF3['lineage'].value_counts() + +# Create a column: lineage_corrupt +gene_LF3['lineage_corrupt'] = gene_LF3['lineage'] + +# Create df with tidy_split: lineage +lf_lin_split = tidy_split(gene_LF3, 'lineage_corrupt', sep = ';') +lf_lin_split['lineage_corrupt'] = lf_lin_split['lineage_corrupt'].str.strip() +lf_lin_split['lineage_corrupt_list'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_corrupt'].apply(list)) +lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_corrupt'].nunique()) + +#---------------------------------- +# Merge with gene_LF3 with +# lf_lin_split +#----------------------------------- +gene_LF3['lineage_ucount'] = gene_LF3['lineage'] + +# quick checks +gene_LF3['lineage_ucount'].equals(gene_LF3['lineage']) + +# merge based on indices +gene_LF3.index.nunique() +lf_lin_split.index.nunique() +all(gene_LF3.index.isin(lf_lin_split.index)) +all(lf_lin_split.index.isin(gene_LF3.index)) + +# magic merge happens here +gene_LF3.loc[lf_lin_split.index, 'lineage_ucount'] = lf_lin_split['lineage_corrupt_ucount'] + +#%% sanity checks +check1 = gene_LF3[['mutationinformation', 'lineage', 'lineage_ucount']] +check2 = check1[check1.loc[:, 'mutationinformation'].isin(['H57D'])] +check2.value_counts() + +#%% \ No newline at end of file