finally added all the lineage calculations
This commit is contained in:
parent
95a73efdd2
commit
bf7060baa9
1 changed files with 177 additions and 30 deletions
|
@ -240,6 +240,16 @@ meta_data['lineage'].value_counts()
|
|||
meta_data['lineage'].value_counts().sum()
|
||||
meta_data['lineage'].nunique()
|
||||
|
||||
# replace lineage with 'L' in lineage_labels
|
||||
#meta_data['lineage_labels'] = meta_data['lineage']
|
||||
#meta_data['lineage_labels'].equals(meta_data['lineage'])
|
||||
#all(meta_data['lineage_labels'].value_counts() == meta_data['lineage'].value_counts())
|
||||
#meta_data['lineage_labels'] = meta_data['lineage_labels'].str.replace('lineage', 'L')
|
||||
#meta_data['lineage'].value_counts()
|
||||
#meta_data['lineage_labels'].value_counts()
|
||||
|
||||
meta_data['lineage'] = meta_data['lineage'].str.replace('lineage', 'L')
|
||||
|
||||
meta_data['id'].nunique()
|
||||
meta_data['sample'].nunique()
|
||||
meta_data['id'].equals(meta_data['sample'])
|
||||
|
@ -268,7 +278,7 @@ print('===========================================================\n'
|
|||
, 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts()
|
||||
, '\n===========================================================\n'
|
||||
, 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
|
||||
, '\n===========================================================')
|
||||
, '\n===========================================================')
|
||||
#%% Important sanity checks
|
||||
# This is to find out how many samples have 1 and more than 1 mutation,so you
|
||||
# can use it to check if your data extraction process for dr_muts
|
||||
|
@ -523,7 +533,6 @@ del(out_filename_cid)
|
|||
|
||||
# clear variables
|
||||
del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2)
|
||||
|
||||
#%% Extract gene specific nsSNPs: all nsSNPs i.e.'nssnp_match'
|
||||
print('Extracting nsSNP match:', gene, 'mutations from cols:\n'
|
||||
, dr_muts_col, 'and', other_muts_col, 'using string match:'
|
||||
|
@ -911,9 +920,9 @@ ambiguous_muts_value_counts
|
|||
gene_LF1_orig = gene_LF1.copy()
|
||||
gene_LF1_orig.equals(gene_LF1)
|
||||
|
||||
# copy the old column for checking
|
||||
gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info']
|
||||
gene_LF1['mutation_info_orig'].value_counts()
|
||||
# copy the old columns for checking
|
||||
gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info']
|
||||
gene_LF1['mutation_info_v1'] = gene_LF1['mutation_info']
|
||||
gene_LF1['mutation_info'].value_counts()
|
||||
|
||||
#=====================================
|
||||
|
@ -956,22 +965,23 @@ changes_val
|
|||
changes_total = sum(changes_val)
|
||||
changes_dict
|
||||
|
||||
# TODO: Add sanity check to make sure you can add value_count checks
|
||||
|
||||
#%%FIXME: TODO: Add sanity check to make sure you can add value_count checks
|
||||
#%% Merging ambiguous muts
|
||||
#=================
|
||||
# Merge ambig muts
|
||||
# with gene_LF1
|
||||
#===================
|
||||
ambig_muts_rev_df.index
|
||||
gene_LF1.index
|
||||
all(ambig_muts_rev_df.index.isin(gene_LF1.index))
|
||||
|
||||
gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
|
||||
gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV']
|
||||
gene_LF1['mutation_info_orig'].value_counts()
|
||||
gene_LF1['mutation_info'].value_counts()
|
||||
gene_LF1['mutation_info_v1'].value_counts()
|
||||
foo = gene_LF1.iloc[ambig_muts_rev_df.index]
|
||||
foo[['mutation', 'mutation_info', 'mutation_info_orig']]
|
||||
|
||||
# Sanity check1: if there are still any ambiguous muts
|
||||
muts_split_rev = list(gene_LF1.groupby('mutation_info'))
|
||||
muts_split_rev = list(gene_LF1.groupby('mutation_info_v1'))
|
||||
dr_muts_rev = muts_split_rev[0][1].mutation
|
||||
other_muts_rev = muts_split_rev[1][1].mutation
|
||||
print('splitting muts by mut_info:', muts_split_rev)
|
||||
|
@ -981,7 +991,7 @@ print('no. of other_muts samples', len(other_muts_rev))
|
|||
if not dr_muts_rev.isin(other_muts_rev).sum() & other_muts_rev.isin(dr_muts_rev).sum() > 0:
|
||||
print('\nAmbiguous muts corrected. Proceeding with downstream analysis')
|
||||
else:
|
||||
print('\nAmbiguous muts corrected. Quitting!')
|
||||
print('\nAmbiguous muts NOT corrected. Quitting!')
|
||||
sys.exit()
|
||||
|
||||
#%% OUTFILE 3, write file: ambiguous muts and ambiguous mut counts
|
||||
|
@ -1543,8 +1553,12 @@ gene_LF2['index_orig'] = gene_LF2.index # need it for setting back later
|
|||
#%% FIXME: Add sanity check
|
||||
#gene_LF2['index_orig'].equals(gene_LF2.index)
|
||||
#%% Set index: 'mutationinformation' for adding multimode
|
||||
gene_LF3 = gene_LF2.set_index(['mutationinformation'])
|
||||
gene_LF2['Mut'] = gene_LF2['mutationinformation']
|
||||
#%% Further mappings: gene_LF3
|
||||
gene_LF3 = gene_LF2.set_index(['Mut'])
|
||||
gene_LF3.index
|
||||
#gene_LF3 = gene_LF2.set_index(['mutationinformation'])
|
||||
|
||||
gene_LF3['dst_multimode'].value_counts()
|
||||
gene_LF3['dst_multimode'].value_counts().sum()
|
||||
#%% Multimode: dst
|
||||
|
@ -1640,6 +1654,15 @@ gene_LF3['mutation_info_labels_orig'].value_counts().sum()
|
|||
|
||||
gene_LF3['mutation_info_labels'].value_counts()
|
||||
gene_LF3['mutation_info_labels'].value_counts().sum()
|
||||
|
||||
gene_LF3['mutation_info_labels'].value_counts()
|
||||
gene_LF3['mutation_info'].value_counts()
|
||||
gene_LF3['mutation_info_orig'].value_counts()
|
||||
gene_LF3['mutation_info_orig'].value_counts().sum()
|
||||
|
||||
gene_LF3['mutation_info_v1'].value_counts()
|
||||
gene_LF3['mutation_info_v1'].value_counts().sum()
|
||||
|
||||
#%% TEST muts
|
||||
test_muts = ['G132A', 'V180F', 'G108R', 'A102P']
|
||||
test_muts = ['L4S', 'L4W', 'A102P']
|
||||
|
@ -1694,17 +1717,8 @@ foo_copy.loc[foo2.index, 'lineage_ucount'] = foo2['lineage_corrupt_ucount']
|
|||
#--------------------------
|
||||
# lineage multimode mode
|
||||
#--------------------------
|
||||
lineage_label_map = {'lineage1' : 'L1'
|
||||
, 'lineage2' : 'L2'
|
||||
, 'lineage3' : 'L3'
|
||||
, 'lineage4' : 'L4'
|
||||
, 'lineage5' : 'L5'
|
||||
, 'lineage6' : 'L6'
|
||||
, 'lineage7' : 'L7'
|
||||
, 'lineageBOV' : 'LBOV'}
|
||||
|
||||
foo['lineage'].value_counts()
|
||||
foo_updated = foo.replace(to_replace ='lineage', value = 'L', regex = True) # works
|
||||
foo['lineage_labels'] = foo['lineage_labels'].str.replace('lineage', 'L')
|
||||
# foo_updated = foo.replace(to_replace ='lineage', value = 'L', regex = True) # doesn't specify a column
|
||||
foo['lineage_labels'] = foo['lineage']
|
||||
|
||||
#df['team'] = df['team'].apply(lambda x: re.sub(r'[\n\r]*','', str(x)))
|
||||
|
@ -1727,8 +1741,8 @@ foo2['lineage_numeric'].value_counts()
|
|||
|
||||
foo2['lineage_numeric_list'] = foo2['mutationinformation'].map(foo2.groupby('mutationinformation')['lineage_numeric'].apply(list))
|
||||
foo2['lineage_numeric_list']
|
||||
|
||||
foo2['lineage_multimode'] = foo2.groupby(['mutationinformation'])['lineage_numeric'].agg(multimode)
|
||||
|
||||
c2 = foo2[foo2.loc[:, 'MUT'].isin(['A102P'])]
|
||||
c2['lineage_numeric'].value_counts()
|
||||
|
||||
|
@ -1743,36 +1757,160 @@ gene_LF3['lineage'].value_counts()
|
|||
# Create a column: lineage_corrupt
|
||||
gene_LF3['lineage_corrupt'] = gene_LF3['lineage']
|
||||
|
||||
##############################
|
||||
# CHECK may be you only need it for multimode merge
|
||||
# set index
|
||||
# gene_LF3['index_orig_copy'] = gene_LF3['index_orig']
|
||||
# gene_LF3['index_orig_copy'].head
|
||||
|
||||
# gene_LF3.index
|
||||
# gene_LF3 = gene_LF3.set_index(['index_orig_copy'])
|
||||
# gene_LF3.index
|
||||
################################
|
||||
|
||||
# Create df with tidy_split: lineage
|
||||
lf_lin_split = tidy_split(gene_LF3, 'lineage_corrupt', sep = ';')
|
||||
lf_lin_split['lineage_corrupt'] = lf_lin_split['lineage_corrupt'].str.strip()
|
||||
|
||||
# Add all lineages for each mutation
|
||||
lf_lin_split['lineage_corrupt_list'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_corrupt'].apply(list))
|
||||
|
||||
# Add lineage unique count
|
||||
lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_corrupt'].nunique())
|
||||
|
||||
#----------------------------------
|
||||
# Add lineage_set
|
||||
lf_lin_split['lineage_set'] = lf_lin_split['lineage_corrupt_list'].apply(lambda x : set(list(x)))
|
||||
lf_lin_split['lineage_ulist'] = lf_lin_split['lineage_set'].apply(lambda x : list(x))
|
||||
|
||||
#--------------
|
||||
# Multimode lineage
|
||||
# ONLY after split else the corrupt ones don't get mapped
|
||||
#--------------
|
||||
# Do this mapping after tidy split else the ones with the
|
||||
lineage_label_numeric = {'L1' : 1
|
||||
, 'L2' : 2
|
||||
, 'L3' : 3
|
||||
, 'L4' : 4
|
||||
, 'L5' : 5
|
||||
, 'L6' : 6
|
||||
, 'L7' : 7
|
||||
, 'LBOV' : 8}
|
||||
|
||||
# lineage_numeric = {'lineage1' : 1
|
||||
# , 'lineage2' : 2
|
||||
# , 'lineage3' : 3
|
||||
# , 'lineage4' : 4
|
||||
# , 'lineage5' : 5
|
||||
# , 'lineage6' : 6
|
||||
# , 'lineage7' : 7
|
||||
# , 'lineageBOV' : 8}
|
||||
|
||||
lf_lin_split['lineage_corrupt'].value_counts()
|
||||
lf_lin_split['lineage_numeric'] = lf_lin_split['lineage_corrupt'].map(lineage_label_numeric)
|
||||
lf_lin_split['lineage_corrupt'].value_counts()
|
||||
lf_lin_split['lineage_numeric'].value_counts()
|
||||
|
||||
lf_lin_split['lineage_numeric_list'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_numeric'].apply(list))
|
||||
lf_lin_split['lineage_numeric_list']
|
||||
|
||||
#-------------------
|
||||
# change indices
|
||||
# to Mut as Multimode allows direct mapping this way!
|
||||
#-------------------
|
||||
lf_lin_split['Mut'] = lf_lin_split['mutationinformation']
|
||||
lf_lin_split['Mut']
|
||||
lf_lin_split = lf_lin_split.set_index(['Mut'])
|
||||
lf_lin_split.index
|
||||
|
||||
lf_lin_split['lineage_multimode'] = lf_lin_split.groupby(['mutationinformation'])['lineage_numeric'].agg(multimode)
|
||||
lf_lin_split['lineage_multimode'].value_counts()
|
||||
|
||||
#-------------------
|
||||
# Reset indices
|
||||
# to index orig to allow merge with gene_LF3
|
||||
#-------------------
|
||||
lf_lin_split.index
|
||||
lf_lin_split['index_orig_copy'] = lf_lin_split['index_orig']
|
||||
lf_lin_split = lf_lin_split.set_index(['index_orig_copy'])
|
||||
lf_lin_split.index
|
||||
|
||||
###############################################################################
|
||||
#================================
|
||||
# Merge with gene_LF3 with
|
||||
# lf_lin_split
|
||||
#-----------------------------------
|
||||
#================================
|
||||
# set index
|
||||
gene_LF3['index_orig_copy'] = gene_LF3['index_orig']
|
||||
gene_LF3['index_orig_copy'].head
|
||||
|
||||
gene_LF3.index
|
||||
gene_LF3 = gene_LF3.set_index(['index_orig_copy'])
|
||||
gene_LF3.index
|
||||
|
||||
#-------------------------
|
||||
# colum lineage_ucount:
|
||||
# contribution of each distinct lineage
|
||||
#-------------------------
|
||||
gene_LF3['lineage_ucount'] = gene_LF3['lineage']
|
||||
|
||||
# quick checks
|
||||
gene_LF3['lineage_ucount'].equals(gene_LF3['lineage'])
|
||||
#-------------------------
|
||||
# colum lineage list:
|
||||
#-------------------------
|
||||
#gene_LF3['lineage_set'] = gene_LF3['lineage']
|
||||
gene_LF3['lineage_ulist'] = gene_LF3['lineage']
|
||||
|
||||
gene_LF3['lineage_list'] = gene_LF3['lineage']
|
||||
|
||||
#-------------------------
|
||||
# colum lineage_list mode:
|
||||
#-------------------------
|
||||
gene_LF3['lineage_mode'] = gene_LF3['lineage']
|
||||
|
||||
########################
|
||||
|
||||
# merge based on indices
|
||||
gene_LF3.index.nunique()
|
||||
lf_lin_split.index.nunique()
|
||||
all(gene_LF3.index.isin(lf_lin_split.index))
|
||||
all(lf_lin_split.index.isin(gene_LF3.index))
|
||||
gene_LF3.index
|
||||
lf_lin_split.index
|
||||
|
||||
if (gene_LF3.index.nunique() == lf_lin_split.index.nunique()) and ( all(gene_LF3.index.isin(lf_lin_split.index)) == all(lf_lin_split.index.isin(gene_LF3.index)) ):
|
||||
print('\nPass: merging lineage_ucount with gene_LF3')
|
||||
else:
|
||||
print('\nFail: Indices mismatch, cannot merge! Quitting!')
|
||||
sys.exit()
|
||||
|
||||
###########################
|
||||
# magic merge happens here
|
||||
###########################
|
||||
gene_LF3.loc[lf_lin_split.index, 'lineage_ucount'] = lf_lin_split['lineage_corrupt_ucount']
|
||||
gene_LF3['lineage_ucount'].value_counts()
|
||||
|
||||
#gene_LF3.loc[lf_lin_split.index, 'lineage_set'] = lf_lin_split['lineage_set']
|
||||
#gene_LF3['lineage_set'].value_counts()
|
||||
gene_LF3.loc[lf_lin_split.index, 'lineage_ulist'] = lf_lin_split['lineage_ulist']
|
||||
gene_LF3['lineage_ulist'].value_counts()
|
||||
|
||||
gene_LF3.loc[lf_lin_split.index, 'lineage_list'] = lf_lin_split['lineage_corrupt_list']
|
||||
gene_LF3['lineage_list'].value_counts()
|
||||
|
||||
gene_LF3.loc[lf_lin_split.index, 'lineage_mode'] = lf_lin_split['lineage_multimode']
|
||||
gene_LF3['lineage_mode'].value_counts()
|
||||
|
||||
foo = gene_LF3[['mutationinformation', 'lineage', 'lineage_ucount'
|
||||
#, 'lineage_set'
|
||||
, 'lineage_ulist'
|
||||
, 'lineage_mode'
|
||||
, 'lineage_list']]
|
||||
|
||||
#%% sanity checks
|
||||
check1 = gene_LF3[['mutationinformation', 'lineage', 'lineage_ucount']]
|
||||
check2 = check1[check1.loc[:, 'mutationinformation'].isin(['H57D'])]
|
||||
check2.value_counts()
|
||||
|
||||
#%% Reset index: original indices [WAS above section Revised counts]
|
||||
#%% Reset index: original indices [WAS above ]
|
||||
#gene_LF3 = gene_LF3.reset_index()
|
||||
gene_LF3.index
|
||||
gene_LF3['mutationinformation'] = gene_LF3.index
|
||||
|
@ -1780,5 +1918,14 @@ gene_LF3 = gene_LF3.set_index(['index_orig'])
|
|||
|
||||
gene_LF3[['mutationinformation']]
|
||||
gene_LF3.index
|
||||
#%% Remove MUT column not needed
|
||||
# sanity check
|
||||
if (all(gene_LF3['Mut'] == gene_LF3['mutationinformation'])):
|
||||
print('\nPass: Mutationinformation check successful')
|
||||
else:
|
||||
sys.exit('\nERROR: mutationin cross checks failed. Please check your group_by() aggregate functions')
|
||||
|
||||
# Drop mutation column
|
||||
gene_LF3.drop(['MUT'], axis = 1, inplace = True)
|
||||
#%% ADD summary results
|
||||
#%% final output file with selected columns
|
Loading…
Add table
Add a link
Reference in a new issue