got to the lineage extraction bit
This commit is contained in:
parent
0867827ec6
commit
1371704685
1 changed file with 174 additions and 19 deletions
|
@ -260,7 +260,6 @@ print("\n================================"
|
||||||
, "\nMissing lineage samples:", meta_data['id'].nunique() - meta_data['lineage'].value_counts().sum()
|
, "\nMissing lineage samples:", meta_data['id'].nunique() - meta_data['lineage'].value_counts().sum()
|
||||||
, "\n================================")
|
, "\n================================")
|
||||||
|
|
||||||
|
|
||||||
meta_data['id'].nunique()
|
meta_data['id'].nunique()
|
||||||
meta_data['sample'].nunique()
|
meta_data['sample'].nunique()
|
||||||
meta_data['id'].equals(meta_data['sample'])
|
meta_data['id'].equals(meta_data['sample'])
|
||||||
|
@ -1133,12 +1132,12 @@ gene_LF1.sort_values(by = ['position'], inplace = True)
|
||||||
bar = gene_LF1['position'].value_counts()
|
bar = gene_LF1['position'].value_counts()
|
||||||
|
|
||||||
# FIXME:Can only compare identically-labeled Series objects
|
# FIXME:Can only compare identically-labeled Series objects
|
||||||
if (foo == bar).all():
|
#if (foo == bar).all():
|
||||||
print('PASS: df ordered by position')
|
# print('PASS: df ordered by position')
|
||||||
print(gene_LF1['position'].head())
|
# print(gene_LF1['position'].head())
|
||||||
else:
|
#else:
|
||||||
print('FAIL: df could not be ordered. Check source')
|
# print('FAIL: df could not be ordered. Check source')
|
||||||
sys.exit()
|
# sys.exit()
|
||||||
|
|
||||||
#%% Create a copy of mutationinformation column for downstream merging
|
#%% Create a copy of mutationinformation column for downstream merging
|
||||||
gene_LF1['Mut'] = gene_LF1['mutationinformation']
|
gene_LF1['Mut'] = gene_LF1['mutationinformation']
|
||||||
|
@ -1238,7 +1237,7 @@ del(out_filename_metadata_poscounts)
|
||||||
#%% Add column: aa_calcprop
|
#%% Add column: aa_calcprop
|
||||||
#%% NEW mappings: gene_LF2
|
#%% NEW mappings: gene_LF2
|
||||||
# gene_LF2: copy gene_LF1
|
# gene_LF2: copy gene_LF1
|
||||||
gene_LF2 = gene_LF1.copy()gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list)
|
gene_LF2 = gene_LF1.copy()
|
||||||
gene_LF2.index
|
gene_LF2.index
|
||||||
|
|
||||||
#%% Add total unique id count
|
#%% Add total unique id count
|
||||||
|
@ -1344,7 +1343,6 @@ gene_LF3.head()
|
||||||
|
|
||||||
foo = gene_LF3[['Mut', 'position', 'drtype', 'drtype_multimode', 'drtype_mode', 'drtype_max']]
|
foo = gene_LF3[['Mut', 'position', 'drtype', 'drtype_multimode', 'drtype_mode', 'drtype_max']]
|
||||||
foo2 = foo.sort_values(['position', 'Mut'])
|
foo2 = foo.sort_values(['position', 'Mut'])
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
#%% Mapping 2: column '<dst>', drug
|
#%% Mapping 2: column '<dst>', drug
|
||||||
|
|
||||||
|
@ -1355,6 +1353,7 @@ foo2 = foo.sort_values(['position', 'Mut'])
|
||||||
dm_om_label_map = {dr_muts_col: 'DM'
|
dm_om_label_map = {dr_muts_col: 'DM'
|
||||||
, other_muts_col: 'OM'}
|
, other_muts_col: 'OM'}
|
||||||
dm_om_label_map
|
dm_om_label_map
|
||||||
|
|
||||||
gene_LF3['mutation_info_labels'] = gene_LF3['mutation_info'].map(dm_om_label_map)
|
gene_LF3['mutation_info_labels'] = gene_LF3['mutation_info'].map(dm_om_label_map)
|
||||||
|
|
||||||
# mapping 1.2: numeric
|
# mapping 1.2: numeric
|
||||||
|
@ -1418,10 +1417,9 @@ gene_LF3['dst_multimode']
|
||||||
gene_LF3['dst_mode'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
|
gene_LF3['dst_mode'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
|
||||||
|
|
||||||
# sanity checks
|
# sanity checks
|
||||||
gene_LF3['dst_noNA'].equals(gene_LF3['dst_mode'])
|
#gene_LF3['dst_noNA'].equals(gene_LF3['dst_mode'])
|
||||||
|
|
||||||
gene_LF3[drug].value_counts()
|
gene_LF3[drug].value_counts()
|
||||||
gene_LF3['dst_noNA'].value_counts()
|
#gene_LF3['dst_noNA'].value_counts()
|
||||||
gene_LF3['dst_mode'].value_counts()
|
gene_LF3['dst_mode'].value_counts()
|
||||||
|
|
||||||
foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']]
|
foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']]
|
||||||
|
@ -1443,10 +1441,10 @@ print('\n------------------------------------------------------'
|
||||||
, gene_LF3['dst_mode'].value_counts()
|
, gene_LF3['dst_mode'].value_counts()
|
||||||
, '\nTotal samples [revised]', gene_LF3['dst_mode'].value_counts().sum()
|
, '\nTotal samples [revised]', gene_LF3['dst_mode'].value_counts().sum()
|
||||||
|
|
||||||
, '\n----------------------------------'
|
# , '\n----------------------------------'
|
||||||
, '\nRevised drug column count: dst_noNA\n'
|
# , '\nRevised drug column count: dst_noNA\n'
|
||||||
, '\n----------------------------------\n'
|
# , '\n----------------------------------\n'
|
||||||
, gene_LF3['dst_noNA'].value_counts()
|
# , gene_LF3['dst_noNA'].value_counts()
|
||||||
)
|
)
|
||||||
#%% Create revised mutation_info_column based on dst_mode
|
#%% Create revised mutation_info_column based on dst_mode
|
||||||
#---------------------------------------
|
#---------------------------------------
|
||||||
|
@ -1462,8 +1460,8 @@ gene_LF3['mutation_info_labels_v1'].value_counts() == gene_LF3['mutation_info_la
|
||||||
# Now overwrite
|
# Now overwrite
|
||||||
gene_LF3['mutation_info_labels_dst'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
|
gene_LF3['mutation_info_labels_dst'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
|
||||||
gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
|
gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
|
||||||
if (gene_LF3['mutation_info_labels_dst'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()):
|
if all(gene_LF3['mutation_info_labels_dst'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()):
|
||||||
print('\nRevised mutation_info colum created')
|
print('\nPASS: Revised mutation_info column created successfully')
|
||||||
gene_LF3 = gene_LF3.drop(['mutation_info_labels_dst'], axis = 1)
|
gene_LF3 = gene_LF3.drop(['mutation_info_labels_dst'], axis = 1)
|
||||||
else:
|
else:
|
||||||
print('\nmutation info labels numbers mismatch'
|
print('\nmutation info labels numbers mismatch'
|
||||||
|
@ -1474,6 +1472,11 @@ gene_LF3['mutation_info_orig'].value_counts()
|
||||||
gene_LF3['mutation_info_labels_orig'] = gene_LF3['mutation_info_orig'].map(dm_om_label_map)
|
gene_LF3['mutation_info_labels_orig'] = gene_LF3['mutation_info_orig'].map(dm_om_label_map)
|
||||||
gene_LF3['mutation_info_labels_orig'].value_counts()
|
gene_LF3['mutation_info_labels_orig'].value_counts()
|
||||||
|
|
||||||
|
#%% FIXME: Get multimode for dm_om_numeric column
|
||||||
|
#dm_om_multimode_LF4 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
|
||||||
|
#dm_om_multimode_LF4
|
||||||
|
#gene_LF3['dst_multimode_numeric'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF4)
|
||||||
|
|
||||||
# %% sanity check for the revised dst
|
# %% sanity check for the revised dst
|
||||||
gene_LF3[drug].value_counts()
|
gene_LF3[drug].value_counts()
|
||||||
gene_LF3[drug].value_counts().sum()
|
gene_LF3[drug].value_counts().sum()
|
||||||
|
@ -1485,4 +1488,156 @@ gene_LF3['dst_mode'].value_counts().sum()
|
||||||
# direct comparison
|
# direct comparison
|
||||||
gene_LF3['dst'].value_counts()
|
gene_LF3['dst'].value_counts()
|
||||||
gene_LF3['mutation_info_labels'].value_counts()
|
gene_LF3['mutation_info_labels'].value_counts()
|
||||||
#%%
|
#%% Lineage
|
||||||
|
gene_LF3['lineage'].value_counts()
|
||||||
|
# lineage_label_numeric = {'lineage1' : 1
|
||||||
|
# , 'lineage2' : 2
|
||||||
|
# , 'lineage3' : 3
|
||||||
|
# , 'lineage4' : 4
|
||||||
|
# , 'lineage5' : 5
|
||||||
|
# , 'lineage6' : 6
|
||||||
|
# , 'lineage7' : 7
|
||||||
|
# , 'lineageBOV' : 8}
|
||||||
|
|
||||||
|
lineage_label_numeric = {'L1' : 1
|
||||||
|
, 'L2' : 2
|
||||||
|
, 'L3' : 3
|
||||||
|
, 'L4' : 4
|
||||||
|
, 'L5' : 5
|
||||||
|
, 'L6' : 6
|
||||||
|
, 'L7' : 7
|
||||||
|
, 'LBOV' : 8}
|
||||||
|
|
||||||
|
lineage_label_numeric
|
||||||
|
# copy column to allow cross checks after stripping white space
|
||||||
|
gene_LF3['lineage'] = gene_LF3['lineage'].str.strip()
|
||||||
|
gene_LF3['lineage_corrupt'] = gene_LF3['lineage']
|
||||||
|
#all(gene_LF3['lineage_corrupt'].value_counts() ==gene_LF3['lineage'].value_counts())
|
||||||
|
gene_LF3['lineage_corrupt'].value_counts()
|
||||||
|
|
||||||
|
#%%tidy_split(): Lineage
|
||||||
|
# Create df with tidy_split: lineage
|
||||||
|
lf_lin_split = tidy_split(gene_LF3, 'lineage_corrupt', sep = ';')
|
||||||
|
lf_lin_split['lineage_corrupt'] = lf_lin_split['lineage_corrupt'].str.strip()
|
||||||
|
lf_lin_split['lineage_corrupt'].value_counts()
|
||||||
|
|
||||||
|
# Map lineage labels to numbers to allow metrics
|
||||||
|
lf_lin_split['lineage_numeric'] = lf_lin_split['lineage_corrupt'].map(lineage_label_numeric)
|
||||||
|
lf_lin_split['lineage_numeric'].value_counts()
|
||||||
|
|
||||||
|
#--------------------------------
|
||||||
|
# Lineage_corrupt ALL values:
|
||||||
|
#--------------------------------
|
||||||
|
# Add all lineages for each mutation
|
||||||
|
lf_lin_split['lineage_corrupt_list'] = lf_lin_split['lineage_corrupt']
|
||||||
|
lf_lin_split['lineage_corrupt_list'].value_counts()
|
||||||
|
#lf_lin_split['lineage_corrupt_list'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformati
|
||||||
|
lf_lin_split['lineage_corrupt_list'] = lf_lin_split.groupby('mutationinformation').lineage_corrupt_list.apply(list)
|
||||||
|
lf_lin_split['lineage_corrupt_list'].value_counts()
|
||||||
|
|
||||||
|
# Add lineage unique count
|
||||||
|
lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['Mut'].map(lf_lin_split.groupby('Mut')['lineage_corrupt'].nunique())
|
||||||
|
#lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['lineage_corrupt']
|
||||||
|
lf_lin_split['lineage_corrupt_ucount'].value_counts()
|
||||||
|
|
||||||
|
# Add lineage_set
|
||||||
|
lf_lin_split['lineage_set'] = lf_lin_split['lineage_corrupt_list'].apply(lambda x : set(list(x)))
|
||||||
|
lf_lin_split['lineage_ulist'] = lf_lin_split['lineage_set'].apply(lambda x : list(x))
|
||||||
|
|
||||||
|
#-------------------------------------
|
||||||
|
# Lineage numeric mode: multimode
|
||||||
|
#-------------------------------------
|
||||||
|
lf_lin_split['lineage_multimode'] = lf_lin_split.groupby('mutationinformation')['lineage_numeric'].agg(multimode)
|
||||||
|
lf_lin_split['lineage_multimode'].value_counts()
|
||||||
|
|
||||||
|
# can't take max as it doesn't mean anything!
|
||||||
|
foo = lf_lin_split[['Mut', 'lineage_ulist']]
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
#%% Final merge: gene_LF4 with lineage_split_df
|
||||||
|
gene_LF4 = gene_LF3.copy()
|
||||||
|
gene_LF4.index
|
||||||
|
gene_LF4['index_orig_copy'] = gene_LF4['index_orig']
|
||||||
|
|
||||||
|
lf_lin_split['index_orig_copy'] = lf_lin_split['index_orig']
|
||||||
|
#================================
|
||||||
|
# Merge gene_LF4 (copy of gene_LF3) with
|
||||||
|
# lf_lin_split based on index
|
||||||
|
#================================
|
||||||
|
# set index for lf_lin_split
|
||||||
|
lf_lin_split.index
|
||||||
|
lf_lin_split.reset_index(inplace=True)
|
||||||
|
lf_lin_split['mutationinformation']
|
||||||
|
lf_lin_split = lf_lin_split.set_index(['index_orig_copy'])
|
||||||
|
lf_lin_split.index
|
||||||
|
|
||||||
|
# set index for gene_LF4
|
||||||
|
gene_LF4.index
|
||||||
|
gene_LF4.reset_index(inplace=True)
|
||||||
|
gene_LF4.index
|
||||||
|
gene_LF4['mutationinformation']
|
||||||
|
gene_LF4['index_orig_copy']
|
||||||
|
gene_LF4 = gene_LF4.set_index(['index_orig_copy'])
|
||||||
|
gene_LF4.index
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
# column lineage_ucount:
|
||||||
|
# contribution of each distinct lineage
|
||||||
|
#-------------------------
|
||||||
|
gene_LF4['lineage_ucount'] = gene_LF4['lineage']
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
# column lineage list:
|
||||||
|
#-------------------------
|
||||||
|
#gene_LF4['lineage_set'] = gene_LF4['lineage']
|
||||||
|
gene_LF4['lineage_ulist'] = gene_LF4['lineage']
|
||||||
|
|
||||||
|
gene_LF4['lineage_list'] = gene_LF4['lineage']
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
# column lineage_list mode:
|
||||||
|
#-------------------------
|
||||||
|
gene_LF4['lineage_mode'] = gene_LF4['lineage']
|
||||||
|
|
||||||
|
# merge based on indices
|
||||||
|
gene_LF4.index.nunique()
|
||||||
|
lf_lin_split.index.nunique()
|
||||||
|
all(gene_LF4.index.isin(lf_lin_split.index))
|
||||||
|
all(lf_lin_split.index.isin(gene_LF4.index))
|
||||||
|
gene_LF4.index
|
||||||
|
lf_lin_split.index
|
||||||
|
|
||||||
|
if (gene_LF4.index.nunique() == lf_lin_split.index.nunique()) and ( all(gene_LF4.index.isin(lf_lin_split.index)) == all(lf_lin_split.index.isin(gene_LF4.index)) ):
|
||||||
|
print('\nPass: merging lineage_ucount with gene_LF4')
|
||||||
|
else:
|
||||||
|
sys.exit('\nFail: Indices mismatch, cannot merge! Quitting!')
|
||||||
|
|
||||||
|
###########################
|
||||||
|
# magic merge happens here
|
||||||
|
###########################
|
||||||
|
lf_lin_split.index.drop_duplicates(keep='first')
|
||||||
|
lf_lin_split = lf_lin_split
|
||||||
|
lf_lin_split_U = lf_lin_split[~lf_lin_split.index.duplicated(keep='first')]
|
||||||
|
lf_lin_split_U.shape
|
||||||
|
|
||||||
|
gene_LF4.loc[lf_lin_split_U.index, 'lineage_ucount'] = lf_lin_split_U['lineage_corrupt_ucount']
|
||||||
|
gene_LF4['lineage_ucount'].value_counts()
|
||||||
|
|
||||||
|
#gene_LF4.loc[lf_lin_split_U.index, 'lineage_set'] = lf_lin_split_U['lineage_set']
|
||||||
|
#gene_LF4['lineage_set'].value_counts()
|
||||||
|
gene_LF4.loc[lf_lin_split_U.index, 'lineage_ulist'] = lf_lin_split_U['lineage_ulist']
|
||||||
|
gene_LF4['lineage_ulist'].value_counts()
|
||||||
|
|
||||||
|
gene_LF4.loc[lf_lin_split_U.index, 'lineage_list'] = lf_lin_split_U['lineage_corrupt_list']
|
||||||
|
gene_LF4['lineage_list'].value_counts()
|
||||||
|
|
||||||
|
gene_LF4.loc[lf_lin_split_U.index, 'lineage_mode'] = lf_lin_split_U['lineage_multimode']
|
||||||
|
gene_LF4['lineage_mode'].value_counts()
|
||||||
|
|
||||||
|
foo = gene_LF4[['mutationinformation', 'lineage', 'lineage_ucount'
|
||||||
|
#, 'lineage_set'
|
||||||
|
, 'lineage_ulist'
|
||||||
|
, 'lineage_mode'
|
||||||
|
, 'lineage_list']]
|
||||||
|
#%%
|
||||||
|
#Subset relevant columns for output and put the rest of the output here
|
Loading…
Add table
Add a link
Reference in a new issue