finally added all the lineage calculations

This commit is contained in:
Tanushree Tunstall 2022-04-15 14:41:04 +01:00
parent 95a73efdd2
commit bf7060baa9

View file

@ -240,6 +240,16 @@ meta_data['lineage'].value_counts()
meta_data['lineage'].value_counts().sum()
meta_data['lineage'].nunique()
# replace lineage with 'L' in lineage_labels
#meta_data['lineage_labels'] = meta_data['lineage']
#meta_data['lineage_labels'].equals(meta_data['lineage'])
#all(meta_data['lineage_labels'].value_counts() == meta_data['lineage'].value_counts())
#meta_data['lineage_labels'] = meta_data['lineage_labels'].str.replace('lineage', 'L')
#meta_data['lineage'].value_counts()
#meta_data['lineage_labels'].value_counts()
meta_data['lineage'] = meta_data['lineage'].str.replace('lineage', 'L')
meta_data['id'].nunique()
meta_data['sample'].nunique()
meta_data['id'].equals(meta_data['sample'])
@ -523,7 +533,6 @@ del(out_filename_cid)
# clear variables
del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2)
#%% Extract gene specific nsSNPs: all nsSNPs i.e.'nssnp_match'
print('Extracting nsSNP match:', gene, 'mutations from cols:\n'
, dr_muts_col, 'and', other_muts_col, 'using string match:'
@ -911,9 +920,9 @@ ambiguous_muts_value_counts
gene_LF1_orig = gene_LF1.copy()
gene_LF1_orig.equals(gene_LF1)
# copy the old column for checking
# copy the old columns for checking
gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info']
gene_LF1['mutation_info_orig'].value_counts()
gene_LF1['mutation_info_v1'] = gene_LF1['mutation_info']
gene_LF1['mutation_info'].value_counts()
#=====================================
@ -956,22 +965,23 @@ changes_val
changes_total = sum(changes_val)
changes_dict
# TODO: Add sanity check to make sure you can add value_count checks
#%%FIXME: TODO: Add sanity check to make sure you can add value_count checks
#%% Merging ambiguous muts
#=================
# Merge ambig muts
# with gene_LF1
#===================
ambig_muts_rev_df.index
gene_LF1.index
all(ambig_muts_rev_df.index.isin(gene_LF1.index))
gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV']
gene_LF1['mutation_info_orig'].value_counts()
gene_LF1['mutation_info'].value_counts()
gene_LF1['mutation_info_v1'].value_counts()
foo = gene_LF1.iloc[ambig_muts_rev_df.index]
foo[['mutation', 'mutation_info', 'mutation_info_orig']]
# Sanity check1: if there are still any ambiguous muts
muts_split_rev = list(gene_LF1.groupby('mutation_info'))
muts_split_rev = list(gene_LF1.groupby('mutation_info_v1'))
dr_muts_rev = muts_split_rev[0][1].mutation
other_muts_rev = muts_split_rev[1][1].mutation
print('splitting muts by mut_info:', muts_split_rev)
@ -981,7 +991,7 @@ print('no. of other_muts samples', len(other_muts_rev))
# Sanity check: after the merge, no mutation may appear in both
# mutation_info groups. Count the overlap in each direction and compare each
# count explicitly. The earlier form `not a & b > 0` parsed as
# `not ((a & b) > 0)`, i.e. a bitwise AND of the two counts, which can be 0
# even when both counts are non-zero (e.g. 2 & 4) and so could report
# success while ambiguous mutations were still present.
n_dr_in_other = dr_muts_rev.isin(other_muts_rev).sum()
n_other_in_dr = other_muts_rev.isin(dr_muts_rev).sum()
if n_dr_in_other == 0 and n_other_in_dr == 0:
    print('\nAmbiguous muts corrected. Proceeding with downstream analysis')
else:
    # Overlap remains: stop here rather than continue with ambiguous labels.
    print('\nAmbiguous muts NOT corrected. Quitting!')
    sys.exit()
#%% OUTFILE 3, write file: ambiguous muts and ambiguous mut counts
@ -1543,8 +1553,12 @@ gene_LF2['index_orig'] = gene_LF2.index # need it for setting back later
#%% FIXME: Add sanity check
#gene_LF2['index_orig'].equals(gene_LF2.index)
#%% Set index: 'mutationinformation' for adding multimode
gene_LF3 = gene_LF2.set_index(['mutationinformation'])
gene_LF2['Mut'] = gene_LF2['mutationinformation']
#%% Further mappings: gene_LF3
gene_LF3 = gene_LF2.set_index(['Mut'])
gene_LF3.index
#gene_LF3 = gene_LF2.set_index(['mutationinformation'])
gene_LF3['dst_multimode'].value_counts()
gene_LF3['dst_multimode'].value_counts().sum()
#%% Multimode: dst
@ -1640,6 +1654,15 @@ gene_LF3['mutation_info_labels_orig'].value_counts().sum()
gene_LF3['mutation_info_labels'].value_counts()
gene_LF3['mutation_info_labels'].value_counts().sum()
gene_LF3['mutation_info_labels'].value_counts()
gene_LF3['mutation_info'].value_counts()
gene_LF3['mutation_info_orig'].value_counts()
gene_LF3['mutation_info_orig'].value_counts().sum()
gene_LF3['mutation_info_v1'].value_counts()
gene_LF3['mutation_info_v1'].value_counts().sum()
#%% TEST muts
test_muts = ['G132A', 'V180F', 'G108R', 'A102P']
test_muts = ['L4S', 'L4W', 'A102P']
@ -1694,17 +1717,8 @@ foo_copy.loc[foo2.index, 'lineage_ucount'] = foo2['lineage_corrupt_ucount']
#--------------------------
# lineage multimode mode
#--------------------------
lineage_label_map = {'lineage1' : 'L1'
, 'lineage2' : 'L2'
, 'lineage3' : 'L3'
, 'lineage4' : 'L4'
, 'lineage5' : 'L5'
, 'lineage6' : 'L6'
, 'lineage7' : 'L7'
, 'lineageBOV' : 'LBOV'}
foo['lineage'].value_counts()
foo_updated = foo.replace(to_replace ='lineage', value = 'L', regex = True) # works
foo['lineage_labels'] = foo['lineage_labels'].str.replace('lineage', 'L')
# foo_updated = foo.replace(to_replace ='lineage', value = 'L', regex = True) # doesn't specify a column
foo['lineage_labels'] = foo['lineage']
#df['team'] = df['team'].apply(lambda x: re.sub(r'[\n\r]*','', str(x)))
@ -1727,8 +1741,8 @@ foo2['lineage_numeric'].value_counts()
foo2['lineage_numeric_list'] = foo2['mutationinformation'].map(foo2.groupby('mutationinformation')['lineage_numeric'].apply(list))
foo2['lineage_numeric_list']
foo2['lineage_multimode'] = foo2.groupby(['mutationinformation'])['lineage_numeric'].agg(multimode)
c2 = foo2[foo2.loc[:, 'MUT'].isin(['A102P'])]
c2['lineage_numeric'].value_counts()
@ -1743,36 +1757,160 @@ gene_LF3['lineage'].value_counts()
# Create a column: lineage_corrupt
gene_LF3['lineage_corrupt'] = gene_LF3['lineage']
##############################
# CHECK: maybe you only need it for the multimode merge
# set index
# gene_LF3['index_orig_copy'] = gene_LF3['index_orig']
# gene_LF3['index_orig_copy'].head
# gene_LF3.index
# gene_LF3 = gene_LF3.set_index(['index_orig_copy'])
# gene_LF3.index
################################
# Create df with tidy_split: lineage
lf_lin_split = tidy_split(gene_LF3, 'lineage_corrupt', sep = ';')
lf_lin_split['lineage_corrupt'] = lf_lin_split['lineage_corrupt'].str.strip()
# Add all lineages for each mutation
lf_lin_split['lineage_corrupt_list'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_corrupt'].apply(list))
# Add lineage unique count
lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_corrupt'].nunique())
#----------------------------------
# Add lineage_set
lf_lin_split['lineage_set'] = lf_lin_split['lineage_corrupt_list'].apply(lambda x : set(list(x)))
lf_lin_split['lineage_ulist'] = lf_lin_split['lineage_set'].apply(lambda x : list(x))
#--------------
# Multimode lineage
# ONLY after split else the corrupt ones don't get mapped
#--------------
# Do this mapping after tidy split, else the corrupt (';'-separated) lineage values don't get mapped
lineage_label_numeric = {'L1' : 1
, 'L2' : 2
, 'L3' : 3
, 'L4' : 4
, 'L5' : 5
, 'L6' : 6
, 'L7' : 7
, 'LBOV' : 8}
# lineage_numeric = {'lineage1' : 1
# , 'lineage2' : 2
# , 'lineage3' : 3
# , 'lineage4' : 4
# , 'lineage5' : 5
# , 'lineage6' : 6
# , 'lineage7' : 7
# , 'lineageBOV' : 8}
lf_lin_split['lineage_corrupt'].value_counts()
lf_lin_split['lineage_numeric'] = lf_lin_split['lineage_corrupt'].map(lineage_label_numeric)
lf_lin_split['lineage_corrupt'].value_counts()
lf_lin_split['lineage_numeric'].value_counts()
lf_lin_split['lineage_numeric_list'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_numeric'].apply(list))
lf_lin_split['lineage_numeric_list']
#-------------------
# change indices
# to Mut as Multimode allows direct mapping this way!
#-------------------
lf_lin_split['Mut'] = lf_lin_split['mutationinformation']
lf_lin_split['Mut']
lf_lin_split = lf_lin_split.set_index(['Mut'])
lf_lin_split.index
lf_lin_split['lineage_multimode'] = lf_lin_split.groupby(['mutationinformation'])['lineage_numeric'].agg(multimode)
lf_lin_split['lineage_multimode'].value_counts()
#-------------------
# Reset indices
# to index orig to allow merge with gene_LF3
#-------------------
lf_lin_split.index
lf_lin_split['index_orig_copy'] = lf_lin_split['index_orig']
lf_lin_split = lf_lin_split.set_index(['index_orig_copy'])
lf_lin_split.index
###############################################################################
#================================
# Merge gene_LF3 with
# lf_lin_split
#-----------------------------------
#================================
# set index
gene_LF3['index_orig_copy'] = gene_LF3['index_orig']
gene_LF3['index_orig_copy'].head
gene_LF3.index
gene_LF3 = gene_LF3.set_index(['index_orig_copy'])
gene_LF3.index
#-------------------------
# column lineage_ucount:
# contribution of each distinct lineage
#-------------------------
gene_LF3['lineage_ucount'] = gene_LF3['lineage']
# quick checks
gene_LF3['lineage_ucount'].equals(gene_LF3['lineage'])
#-------------------------
# column lineage list:
#-------------------------
#gene_LF3['lineage_set'] = gene_LF3['lineage']
gene_LF3['lineage_ulist'] = gene_LF3['lineage']
gene_LF3['lineage_list'] = gene_LF3['lineage']
#-------------------------
# column lineage_list mode:
#-------------------------
gene_LF3['lineage_mode'] = gene_LF3['lineage']
########################
# merge based on indices
gene_LF3.index.nunique()
lf_lin_split.index.nunique()
all(gene_LF3.index.isin(lf_lin_split.index))
all(lf_lin_split.index.isin(gene_LF3.index))
gene_LF3.index
lf_lin_split.index
# Guard for the index-based merge below: gene_LF3 and lf_lin_split must have
# the same number of distinct index labels, and every label of each frame
# must be present in the other. The two containment checks are combined with
# `and`; comparing them with `==` would also pass when BOTH are False (i.e.
# when neither index is contained in the other).
same_n_labels = gene_LF3.index.nunique() == lf_lin_split.index.nunique()
gene_in_split = all(gene_LF3.index.isin(lf_lin_split.index))
split_in_gene = all(lf_lin_split.index.isin(gene_LF3.index))
if same_n_labels and gene_in_split and split_in_gene:
    print('\nPass: merging lineage_ucount with gene_LF3')
else:
    # Indices disagree: the .loc assignments that follow would misalign.
    print('\nFail: Indices mismatch, cannot merge! Quitting!')
    sys.exit()
###########################
# magic merge happens here
###########################
gene_LF3.loc[lf_lin_split.index, 'lineage_ucount'] = lf_lin_split['lineage_corrupt_ucount']
gene_LF3['lineage_ucount'].value_counts()
#gene_LF3.loc[lf_lin_split.index, 'lineage_set'] = lf_lin_split['lineage_set']
#gene_LF3['lineage_set'].value_counts()
gene_LF3.loc[lf_lin_split.index, 'lineage_ulist'] = lf_lin_split['lineage_ulist']
gene_LF3['lineage_ulist'].value_counts()
gene_LF3.loc[lf_lin_split.index, 'lineage_list'] = lf_lin_split['lineage_corrupt_list']
gene_LF3['lineage_list'].value_counts()
gene_LF3.loc[lf_lin_split.index, 'lineage_mode'] = lf_lin_split['lineage_multimode']
gene_LF3['lineage_mode'].value_counts()
foo = gene_LF3[['mutationinformation', 'lineage', 'lineage_ucount'
#, 'lineage_set'
, 'lineage_ulist'
, 'lineage_mode'
, 'lineage_list']]
#%% sanity checks
check1 = gene_LF3[['mutationinformation', 'lineage', 'lineage_ucount']]
check2 = check1[check1.loc[:, 'mutationinformation'].isin(['H57D'])]
check2.value_counts()
#%% Reset index: original indices [WAS above section Revised counts]
#%% Reset index: original indices [WAS above ]
#gene_LF3 = gene_LF3.reset_index()
gene_LF3.index
gene_LF3['mutationinformation'] = gene_LF3.index
@ -1780,5 +1918,14 @@ gene_LF3 = gene_LF3.set_index(['index_orig'])
gene_LF3[['mutationinformation']]
gene_LF3.index
#%% Remove MUT column not needed
# sanity check
if (all(gene_LF3['Mut'] == gene_LF3['mutationinformation'])):
print('\nPass: Mutationinformation check successful')
else:
sys.exit('\nERROR: mutationin cross checks failed. Please check your group_by() aggregate functions')
# Drop mutation column
gene_LF3.drop(['MUT'], axis = 1, inplace = True)
#%% ADD summary results
#%% final output file with selected columns