got to the lineage extraction bit
This commit is contained in:
parent
0867827ec6
commit
1371704685
1 changed files with 174 additions and 19 deletions
|
@ -260,7 +260,6 @@ print("\n================================"
|
|||
, "\nMissing lineage samples:", meta_data['id'].nunique() - meta_data['lineage'].value_counts().sum()
|
||||
, "\n================================")
|
||||
|
||||
|
||||
meta_data['id'].nunique()
|
||||
meta_data['sample'].nunique()
|
||||
meta_data['id'].equals(meta_data['sample'])
|
||||
|
@ -1133,12 +1132,12 @@ gene_LF1.sort_values(by = ['position'], inplace = True)
|
|||
bar = gene_LF1['position'].value_counts()
|
||||
|
||||
# FIXME: Can only compare identically-labeled Series objects
|
||||
if (foo == bar).all():
|
||||
print('PASS: df ordered by position')
|
||||
print(gene_LF1['position'].head())
|
||||
else:
|
||||
print('FAIL: df could not be ordered. Check source')
|
||||
sys.exit()
|
||||
#if (foo == bar).all():
|
||||
# print('PASS: df ordered by position')
|
||||
# print(gene_LF1['position'].head())
|
||||
#else:
|
||||
# print('FAIL: df could not be ordered. Check source')
|
||||
# sys.exit()
|
||||
|
||||
#%% Create a copy of mutationinformation column for downstream merging
|
||||
gene_LF1['Mut'] = gene_LF1['mutationinformation']
|
||||
|
@ -1238,7 +1237,7 @@ del(out_filename_metadata_poscounts)
|
|||
#%% Add column: aa_calcprop
|
||||
#%% NEW mappings: gene_LF2
|
||||
# gene_LF2: copy gene_LF1
|
||||
gene_LF2 = gene_LF1.copy()gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list)
|
||||
gene_LF2 = gene_LF1.copy()
|
||||
gene_LF2.index
|
||||
|
||||
#%% Add total unique id count
|
||||
|
@ -1344,7 +1343,6 @@ gene_LF3.head()
|
|||
|
||||
foo = gene_LF3[['Mut', 'position', 'drtype', 'drtype_multimode', 'drtype_mode', 'drtype_max']]
|
||||
foo2 = foo.sort_values(['position', 'Mut'])
|
||||
|
||||
###############################################################################
|
||||
#%% Mapping 2: column '<dst>', drug
|
||||
|
||||
|
@ -1355,6 +1353,7 @@ foo2 = foo.sort_values(['position', 'Mut'])
|
|||
dm_om_label_map = {dr_muts_col: 'DM'
|
||||
, other_muts_col: 'OM'}
|
||||
dm_om_label_map
|
||||
|
||||
gene_LF3['mutation_info_labels'] = gene_LF3['mutation_info'].map(dm_om_label_map)
|
||||
|
||||
# mapping 1.2: numeric
|
||||
|
@ -1418,10 +1417,9 @@ gene_LF3['dst_multimode']
|
|||
gene_LF3['dst_mode'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
|
||||
|
||||
# sanity checks
|
||||
gene_LF3['dst_noNA'].equals(gene_LF3['dst_mode'])
|
||||
|
||||
#gene_LF3['dst_noNA'].equals(gene_LF3['dst_mode'])
|
||||
gene_LF3[drug].value_counts()
|
||||
gene_LF3['dst_noNA'].value_counts()
|
||||
#gene_LF3['dst_noNA'].value_counts()
|
||||
gene_LF3['dst_mode'].value_counts()
|
||||
|
||||
foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']]
|
||||
|
@ -1443,10 +1441,10 @@ print('\n------------------------------------------------------'
|
|||
, gene_LF3['dst_mode'].value_counts()
|
||||
, '\nTotal samples [revised]', gene_LF3['dst_mode'].value_counts().sum()
|
||||
|
||||
, '\n----------------------------------'
|
||||
, '\nRevised drug column count: dst_noNA\n'
|
||||
, '\n----------------------------------\n'
|
||||
, gene_LF3['dst_noNA'].value_counts()
|
||||
# , '\n----------------------------------'
|
||||
# , '\nRevised drug column count: dst_noNA\n'
|
||||
# , '\n----------------------------------\n'
|
||||
# , gene_LF3['dst_noNA'].value_counts()
|
||||
)
|
||||
#%% Create revised mutation_info_column based on dst_mode
|
||||
#---------------------------------------
|
||||
|
@ -1462,8 +1460,8 @@ gene_LF3['mutation_info_labels_v1'].value_counts() == gene_LF3['mutation_info_la
|
|||
# Now overwrite
|
||||
gene_LF3['mutation_info_labels_dst'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
|
||||
gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
|
||||
if (gene_LF3['mutation_info_labels_dst'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()):
|
||||
print('\nRevised mutation_info colum created')
|
||||
if all(gene_LF3['mutation_info_labels_dst'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()):
|
||||
print('\nPASS: Revised mutation_info column created successfully')
|
||||
gene_LF3 = gene_LF3.drop(['mutation_info_labels_dst'], axis = 1)
|
||||
else:
|
||||
print('\nmutation info labels numbers mismatch'
|
||||
|
@ -1474,6 +1472,11 @@ gene_LF3['mutation_info_orig'].value_counts()
|
|||
gene_LF3['mutation_info_labels_orig'] = gene_LF3['mutation_info_orig'].map(dm_om_label_map)
|
||||
gene_LF3['mutation_info_labels_orig'].value_counts()
|
||||
|
||||
#%% FIXME: Get multimode for dm_om_numeric column
|
||||
#dm_om_multimode_LF4 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
|
||||
#dm_om_multimode_LF4
|
||||
#gene_LF3['dst_multimode_numeric'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF4)
|
||||
|
||||
# %% sanity check for the revised dst
|
||||
gene_LF3[drug].value_counts()
|
||||
gene_LF3[drug].value_counts().sum()
|
||||
|
@ -1485,4 +1488,156 @@ gene_LF3['dst_mode'].value_counts().sum()
|
|||
# direct comparison
|
||||
gene_LF3['dst'].value_counts()
|
||||
gene_LF3['mutation_info_labels'].value_counts()
|
||||
#%%
|
||||
#%% Lineage
|
||||
gene_LF3['lineage'].value_counts()
|
||||
# lineage_label_numeric = {'lineage1' : 1
|
||||
# , 'lineage2' : 2
|
||||
# , 'lineage3' : 3
|
||||
# , 'lineage4' : 4
|
||||
# , 'lineage5' : 5
|
||||
# , 'lineage6' : 6
|
||||
# , 'lineage7' : 7
|
||||
# , 'lineageBOV' : 8}
|
||||
|
||||
lineage_label_numeric = {'L1' : 1
|
||||
, 'L2' : 2
|
||||
, 'L3' : 3
|
||||
, 'L4' : 4
|
||||
, 'L5' : 5
|
||||
, 'L6' : 6
|
||||
, 'L7' : 7
|
||||
, 'LBOV' : 8}
|
||||
|
||||
lineage_label_numeric
|
||||
# copy column to allow cross checks after stripping white space
|
||||
gene_LF3['lineage'] = gene_LF3['lineage'].str.strip()
|
||||
gene_LF3['lineage_corrupt'] = gene_LF3['lineage']
|
||||
#all(gene_LF3['lineage_corrupt'].value_counts() ==gene_LF3['lineage'].value_counts())
|
||||
gene_LF3['lineage_corrupt'].value_counts()
|
||||
|
||||
#%%tidy_split(): Lineage
|
||||
# Create df with tidy_split: lineage
|
||||
lf_lin_split = tidy_split(gene_LF3, 'lineage_corrupt', sep = ';')
|
||||
lf_lin_split['lineage_corrupt'] = lf_lin_split['lineage_corrupt'].str.strip()
|
||||
lf_lin_split['lineage_corrupt'].value_counts()
|
||||
|
||||
# Map lineage labels to numbers to allow metrics
|
||||
lf_lin_split['lineage_numeric'] = lf_lin_split['lineage_corrupt'].map(lineage_label_numeric)
|
||||
lf_lin_split['lineage_numeric'].value_counts()
|
||||
|
||||
#--------------------------------
|
||||
# Lineage_corrupt ALL values:
|
||||
#--------------------------------
|
||||
# Add all lineages for each mutation
|
||||
lf_lin_split['lineage_corrupt_list'] = lf_lin_split['lineage_corrupt']
|
||||
lf_lin_split['lineage_corrupt_list'].value_counts()
|
||||
#lf_lin_split['lineage_corrupt_list'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformati
|
||||
lf_lin_split['lineage_corrupt_list'] = lf_lin_split.groupby('mutationinformation').lineage_corrupt_list.apply(list)
|
||||
lf_lin_split['lineage_corrupt_list'].value_counts()
|
||||
|
||||
# Add lineage unique count
|
||||
lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['Mut'].map(lf_lin_split.groupby('Mut')['lineage_corrupt'].nunique())
|
||||
#lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['lineage_corrupt']
|
||||
lf_lin_split['lineage_corrupt_ucount'].value_counts()
|
||||
|
||||
# Add lineage_set
|
||||
lf_lin_split['lineage_set'] = lf_lin_split['lineage_corrupt_list'].apply(lambda x : set(list(x)))
|
||||
lf_lin_split['lineage_ulist'] = lf_lin_split['lineage_set'].apply(lambda x : list(x))
|
||||
|
||||
#-------------------------------------
|
||||
# Lineage numeric mode: multimode
|
||||
#-------------------------------------
|
||||
lf_lin_split['lineage_multimode'] = lf_lin_split.groupby('mutationinformation')['lineage_numeric'].agg(multimode)
|
||||
lf_lin_split['lineage_multimode'].value_counts()
|
||||
|
||||
# can't take max as it doesn't mean anything!
|
||||
foo = lf_lin_split[['Mut', 'lineage_ulist']]
|
||||
|
||||
###############################################################################
|
||||
#%% Final merge: gene_LF4 with lineage_split_df
|
||||
gene_LF4 = gene_LF3.copy()
|
||||
gene_LF4.index
|
||||
gene_LF4['index_orig_copy'] = gene_LF4['index_orig']
|
||||
|
||||
lf_lin_split['index_orig_copy'] = lf_lin_split['index_orig']
|
||||
#================================
|
||||
# Merge gene_LF3 with
|
||||
# lf_lin_split based on index
|
||||
#================================
|
||||
# set index for lf_lin_split
|
||||
lf_lin_split.index
|
||||
lf_lin_split.reset_index(inplace=True)
|
||||
lf_lin_split['mutationinformation']
|
||||
lf_lin_split = lf_lin_split.set_index(['index_orig_copy'])
|
||||
lf_lin_split.index
|
||||
|
||||
# set index for gene_LF4
|
||||
gene_LF4.index
|
||||
gene_LF4.reset_index(inplace=True)
|
||||
gene_LF4.index
|
||||
gene_LF4['mutationinformation']
|
||||
gene_LF4['index_orig_copy']
|
||||
gene_LF4 = gene_LF4.set_index(['index_orig_copy'])
|
||||
gene_LF4.index
|
||||
|
||||
#-------------------------
|
||||
# column lineage_ucount:
|
||||
# contribution of each distinct lineage
|
||||
#-------------------------
|
||||
gene_LF4['lineage_ucount'] = gene_LF4['lineage']
|
||||
|
||||
#-------------------------
|
||||
# column lineage list:
|
||||
#-------------------------
|
||||
#gene_LF4['lineage_set'] = gene_LF4['lineage']
|
||||
gene_LF4['lineage_ulist'] = gene_LF4['lineage']
|
||||
|
||||
gene_LF4['lineage_list'] = gene_LF4['lineage']
|
||||
|
||||
#-------------------------
|
||||
# column lineage_list mode:
|
||||
#-------------------------
|
||||
gene_LF4['lineage_mode'] = gene_LF4['lineage']
|
||||
|
||||
# merge based on indices
|
||||
gene_LF4.index.nunique()
|
||||
lf_lin_split.index.nunique()
|
||||
all(gene_LF4.index.isin(lf_lin_split.index))
|
||||
all(lf_lin_split.index.isin(gene_LF4.index))
|
||||
gene_LF4.index
|
||||
lf_lin_split.index
|
||||
|
||||
if (gene_LF4.index.nunique() == lf_lin_split.index.nunique()) and ( all(gene_LF4.index.isin(lf_lin_split.index)) == all(lf_lin_split.index.isin(gene_LF4.index)) ):
|
||||
print('\nPass: merging lineage_ucount with gene_LF4')
|
||||
else:
|
||||
sys.exit('\nFail: Indices mismatch, cannot merge! Quitting!')
|
||||
|
||||
###########################
|
||||
# magic merge happens here
|
||||
###########################
|
||||
lf_lin_split.index.drop_duplicates(keep='first')
|
||||
lf_lin_split = lf_lin_split
|
||||
lf_lin_split_U = lf_lin_split[~lf_lin_split.index.duplicated(keep='first')]
|
||||
lf_lin_split_U.shape
|
||||
|
||||
gene_LF4.loc[lf_lin_split_U.index, 'lineage_ucount'] = lf_lin_split_U['lineage_corrupt_ucount']
|
||||
gene_LF4['lineage_ucount'].value_counts()
|
||||
|
||||
#gene_LF4.loc[lf_lin_split_U.index, 'lineage_set'] = lf_lin_split_U['lineage_set']
|
||||
#gene_LF4['lineage_set'].value_counts()
|
||||
gene_LF4.loc[lf_lin_split_U.index, 'lineage_ulist'] = lf_lin_split_U['lineage_ulist']
|
||||
gene_LF4['lineage_ulist'].value_counts()
|
||||
|
||||
gene_LF4.loc[lf_lin_split_U.index, 'lineage_list'] = lf_lin_split_U['lineage_corrupt_list']
|
||||
gene_LF4['lineage_list'].value_counts()
|
||||
|
||||
gene_LF4.loc[lf_lin_split_U.index, 'lineage_mode'] = lf_lin_split_U['lineage_multimode']
|
||||
gene_LF4['lineage_mode'].value_counts()
|
||||
|
||||
foo = gene_LF4[['mutationinformation', 'lineage', 'lineage_ucount'
|
||||
#, 'lineage_set'
|
||||
, 'lineage_ulist'
|
||||
, 'lineage_mode'
|
||||
, 'lineage_list']]
|
||||
#%%
|
||||
#Subset relevant columns for output and put the rest of the output here
|
Loading…
Add table
Add a link
Reference in a new issue