got to the lineage extraction bit

This commit is contained in:
Tanushree Tunstall 2022-04-25 18:37:01 +01:00
parent 0867827ec6
commit 1371704685

View file

@ -260,7 +260,6 @@ print("\n================================"
, "\nMissing lineage samples:", meta_data['id'].nunique() - meta_data['lineage'].value_counts().sum()
, "\n================================")
meta_data['id'].nunique()
meta_data['sample'].nunique()
meta_data['id'].equals(meta_data['sample'])
@ -1133,12 +1132,12 @@ gene_LF1.sort_values(by = ['position'], inplace = True)
bar = gene_LF1['position'].value_counts()
# FIXME: the (foo == bar) comparison raises "Can only compare identically-labeled Series objects"
if (foo == bar).all():
print('PASS: df ordered by position')
print(gene_LF1['position'].head())
else:
print('FAIL: df could not be ordered. Check source')
sys.exit()
#if (foo == bar).all():
# print('PASS: df ordered by position')
# print(gene_LF1['position'].head())
#else:
# print('FAIL: df could not be ordered. Check source')
# sys.exit()
#%% Create a copy of mutationinformation column for downstream merging
gene_LF1['Mut'] = gene_LF1['mutationinformation']
@ -1238,7 +1237,7 @@ del(out_filename_metadata_poscounts)
#%% Add column: aa_calcprop
#%% NEW mappings: gene_LF2
# gene_LF2: copy gene_LF1
gene_LF2 = gene_LF1.copy()gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list)
gene_LF2 = gene_LF1.copy()
gene_LF2.index
#%% Add total unique id count
@ -1344,7 +1343,6 @@ gene_LF3.head()
foo = gene_LF3[['Mut', 'position', 'drtype', 'drtype_multimode', 'drtype_mode', 'drtype_max']]
foo2 = foo.sort_values(['position', 'Mut'])
###############################################################################
#%% Mapping 2: column '<dst>', drug
@ -1355,6 +1353,7 @@ foo2 = foo.sort_values(['position', 'Mut'])
dm_om_label_map = {dr_muts_col: 'DM'
, other_muts_col: 'OM'}
dm_om_label_map
gene_LF3['mutation_info_labels'] = gene_LF3['mutation_info'].map(dm_om_label_map)
# mapping 1.2: numeric
@ -1418,10 +1417,9 @@ gene_LF3['dst_multimode']
gene_LF3['dst_mode'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
# sanity checks
gene_LF3['dst_noNA'].equals(gene_LF3['dst_mode'])
#gene_LF3['dst_noNA'].equals(gene_LF3['dst_mode'])
gene_LF3[drug].value_counts()
gene_LF3['dst_noNA'].value_counts()
#gene_LF3['dst_noNA'].value_counts()
gene_LF3['dst_mode'].value_counts()
foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']]
@ -1443,10 +1441,10 @@ print('\n------------------------------------------------------'
, gene_LF3['dst_mode'].value_counts()
, '\nTotal samples [revised]', gene_LF3['dst_mode'].value_counts().sum()
, '\n----------------------------------'
, '\nRevised drug column count: dst_noNA\n'
, '\n----------------------------------\n'
, gene_LF3['dst_noNA'].value_counts()
# , '\n----------------------------------'
# , '\nRevised drug column count: dst_noNA\n'
# , '\n----------------------------------\n'
# , gene_LF3['dst_noNA'].value_counts()
)
#%% Create revised mutation_info_column based on dst_mode
#---------------------------------------
@ -1462,8 +1460,8 @@ gene_LF3['mutation_info_labels_v1'].value_counts() == gene_LF3['mutation_info_la
# Now overwrite
gene_LF3['mutation_info_labels_dst'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
if (gene_LF3['mutation_info_labels_dst'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()):
print('\nRevised mutation_info colum created')
if all(gene_LF3['mutation_info_labels_dst'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()):
print('\nPASS: Revised mutation_info column created successfully')
gene_LF3 = gene_LF3.drop(['mutation_info_labels_dst'], axis = 1)
else:
print('\nmutation info labels numbers mismatch'
@ -1474,6 +1472,11 @@ gene_LF3['mutation_info_orig'].value_counts()
gene_LF3['mutation_info_labels_orig'] = gene_LF3['mutation_info_orig'].map(dm_om_label_map)
gene_LF3['mutation_info_labels_orig'].value_counts()
#%% FIXME: Get multimode for dm_om_numeric column
#dm_om_multimode_LF4 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
#dm_om_multimode_LF4
#gene_LF3['dst_multimode_numeric'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF4)
# %% sanity check for the revised dst
gene_LF3[drug].value_counts()
gene_LF3[drug].value_counts().sum()
@ -1485,4 +1488,156 @@ gene_LF3['dst_mode'].value_counts().sum()
# direct comparison
gene_LF3['dst'].value_counts()
gene_LF3['mutation_info_labels'].value_counts()
#%%
#%% Lineage
# Look at the raw lineage labels before recoding them.
gene_LF3['lineage'].value_counts()

# Numeric code for each short lineage label: L1..L7 -> 1..7, LBOV -> 8.
# (A previous version of this map was keyed on 'lineage1' ... 'lineageBOV'.)
lineage_label_numeric = dict(zip(('L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'LBOV')
                                 , range(1, 9)))
lineage_label_numeric

# Strip surrounding whitespace from 'lineage' in place, then keep a second
# copy ('lineage_corrupt') so the two columns can be cross-checked later.
gene_LF3['lineage'] = gene_LF3['lineage'].str.strip()
gene_LF3['lineage_corrupt'] = gene_LF3['lineage']
gene_LF3['lineage_corrupt'].value_counts()
#%%tidy_split(): Lineage
# Build lf_lin_split from gene_LF3 by splitting 'lineage_corrupt' on ';'.
# tidy_split() is defined elsewhere; presumably it emits one row per
# ';'-separated value -- TODO confirm against its definition.
lf_lin_split = tidy_split(gene_LF3, 'lineage_corrupt', sep = ';')
# Strip whitespace again, this time on the split values.
lf_lin_split['lineage_corrupt'] = lf_lin_split['lineage_corrupt'].str.strip()
lf_lin_split['lineage_corrupt'].value_counts()
# Map lineage labels to numbers to allow metrics.
# Labels that are not keys of lineage_label_numeric become NaN under .map().
lf_lin_split['lineage_numeric'] = lf_lin_split['lineage_corrupt'].map(lineage_label_numeric)
lf_lin_split['lineage_numeric'].value_counts()
#--------------------------------
# Lineage_corrupt ALL values:
#--------------------------------
# Add all lineages for each mutation.
# Step 1: seed the column with the per-row lineage label.
lf_lin_split['lineage_corrupt_list'] = lf_lin_split['lineage_corrupt']
lf_lin_split['lineage_corrupt_list'].value_counts()
# Step 2: replace it with the list of all labels seen for the same
# 'mutationinformation' value.
# NOTE(review): the grouped result is indexed by 'mutationinformation', so this
# assignment only lines up if lf_lin_split's index holds those same labels;
# otherwise the column ends up NaN -- confirm. An earlier, now commented-out,
# variant mapped the grouped result back through the 'mutationinformation' column.
lf_lin_split['lineage_corrupt_list'] = lf_lin_split.groupby('mutationinformation').lineage_corrupt_list.apply(list)
# NOTE(review): if the cells are lists, value_counts() may raise TypeError
# (unhashable type) -- confirm.
lf_lin_split['lineage_corrupt_list'].value_counts()
# Add lineage unique count: number of distinct lineage labels per 'Mut',
# broadcast back to every row through the 'Mut' column.
lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['Mut'].map(lf_lin_split.groupby('Mut')['lineage_corrupt'].nunique())
lf_lin_split['lineage_corrupt_ucount'].value_counts()
# Add lineage_set: de-duplicate each per-mutation list, then turn the set back
# into a list ('lineage_ulist'). Element order of that list is not guaranteed.
lf_lin_split['lineage_set'] = lf_lin_split['lineage_corrupt_list'].apply(lambda x : set(list(x)))
lf_lin_split['lineage_ulist'] = lf_lin_split['lineage_set'].apply(lambda x : list(x))
#-------------------------------------
# Lineage numeric mode: multimode
#-------------------------------------
# Aggregate the numeric lineage codes per 'mutationinformation' with multimode
# (defined/imported elsewhere in the file).
# NOTE(review): same index-alignment caveat as for 'lineage_corrupt_list' above.
lf_lin_split['lineage_multimode'] = lf_lin_split.groupby('mutationinformation')['lineage_numeric'].agg(multimode)
lf_lin_split['lineage_multimode'].value_counts()
# Can't take the max of the modes: the numeric codes are only labels, so a
# maximum carries no meaning.
# Scratch view for manual inspection.
foo = lf_lin_split[['Mut', 'lineage_ulist']]
###############################################################################
#%% Final merge: gene_LF4 with lineage_split_df
# gene_LF4 starts as a copy of gene_LF3, so the merge below leaves gene_LF3 as is.
gene_LF4 = gene_LF3.copy()
gene_LF4.index

# Duplicate 'index_orig' in both frames; the duplicate becomes the shared
# index while the 'index_orig' column itself stays available.
gene_LF4['index_orig_copy'] = gene_LF4['index_orig']
lf_lin_split['index_orig_copy'] = lf_lin_split['index_orig']

#================================
# Index both frames on
# 'index_orig_copy' so values can
# be transferred by label
#================================
# lf_lin_split: move the current index into the columns, then re-index
lf_lin_split.index
lf_lin_split.reset_index(inplace=True)
lf_lin_split['mutationinformation']
lf_lin_split = lf_lin_split.set_index(['index_orig_copy'])
lf_lin_split.index

# gene_LF4: same treatment
gene_LF4.index
gene_LF4.reset_index(inplace=True)
gene_LF4.index
gene_LF4['mutationinformation']
gene_LF4['index_orig_copy']
gene_LF4 = gene_LF4.set_index(['index_orig_copy'])
gene_LF4.index

#-------------------------
# Placeholder columns, each seeded with 'lineage' and
# overwritten by the merge further down:
#   lineage_ucount : count of distinct lineages
#   lineage_ulist  : unique lineages as a list
#   lineage_list   : all lineages as a list
#   lineage_mode   : lineage multimode
#-------------------------
gene_LF4 = gene_LF4.assign(lineage_ucount = gene_LF4['lineage']
                           , lineage_ulist = gene_LF4['lineage']
                           , lineage_list = gene_LF4['lineage']
                           , lineage_mode = gene_LF4['lineage'])
# merge based on indices
# Gate for the label-based merge below: gene_LF4 and lf_lin_split must carry
# the same set of index labels. Exits the script via sys.exit() otherwise.
gene_LF4.index.nunique()
lf_lin_split.index.nunique()
all(gene_LF4.index.isin(lf_lin_split.index))
all(lf_lin_split.index.isin(gene_LF4.index))
gene_LF4.index
lf_lin_split.index

# Both containment tests must hold. Comparing them with '==' would also
# pass when both are False, i.e. when neither index covers the other.
if (gene_LF4.index.nunique() == lf_lin_split.index.nunique()
        and all(gene_LF4.index.isin(lf_lin_split.index))
        and all(lf_lin_split.index.isin(gene_LF4.index))):
    print('\nPass: merging lineage_ucount with gene_LF4')
else:
    sys.exit('\nFail: Indices mismatch, cannot merge! Quitting!')
###########################
# magic merge happens here
###########################
# lf_lin_split has one row per (sample, lineage) pair, so an index label can
# repeat. Keep the first row for each label: one row per label is all that is
# needed to copy the lineage summary columns across.
lf_lin_split_U = lf_lin_split[~lf_lin_split.index.duplicated(keep='first')]
lf_lin_split_U.shape

# Copy each summary column into its placeholder in gene_LF4, matching rows by
# index label.
gene_LF4.loc[lf_lin_split_U.index, 'lineage_ucount'] = lf_lin_split_U['lineage_corrupt_ucount']
gene_LF4['lineage_ucount'].value_counts()

# The remaining columns may hold list cells, which value_counts() cannot hash,
# so they are stringified for the inspection calls only.
gene_LF4.loc[lf_lin_split_U.index, 'lineage_ulist'] = lf_lin_split_U['lineage_ulist']
gene_LF4['lineage_ulist'].astype(str).value_counts()

gene_LF4.loc[lf_lin_split_U.index, 'lineage_list'] = lf_lin_split_U['lineage_corrupt_list']
gene_LF4['lineage_list'].astype(str).value_counts()

gene_LF4.loc[lf_lin_split_U.index, 'lineage_mode'] = lf_lin_split_U['lineage_multimode']
gene_LF4['lineage_mode'].astype(str).value_counts()

# Scratch view of the merged lineage columns for manual inspection.
foo = gene_LF4[['mutationinformation', 'lineage', 'lineage_ucount'
                , 'lineage_ulist'
                , 'lineage_mode'
                , 'lineage_list']]
#%%
#Subset relevant columns for output and put the rest of the output here