From bf7060baa9860c41ce880ac3a98edc622f16c2ac Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 15 Apr 2022 14:41:04 +0100 Subject: [PATCH] finally added all the lineage calculations --- scripts/data_extraction_v2.py | 207 +++++++++++++++++++++++++++++----- 1 file changed, 177 insertions(+), 30 deletions(-) diff --git a/scripts/data_extraction_v2.py b/scripts/data_extraction_v2.py index b21fa0d..d90c8c4 100644 --- a/scripts/data_extraction_v2.py +++ b/scripts/data_extraction_v2.py @@ -240,6 +240,16 @@ meta_data['lineage'].value_counts() meta_data['lineage'].value_counts().sum() meta_data['lineage'].nunique() +# replace lineage with 'L' in lineage_labels +#meta_data['lineage_labels'] = meta_data['lineage'] +#meta_data['lineage_labels'].equals(meta_data['lineage']) +#all(meta_data['lineage_labels'].value_counts() == meta_data['lineage'].value_counts()) +#meta_data['lineage_labels'] = meta_data['lineage_labels'].str.replace('lineage', 'L') +#meta_data['lineage'].value_counts() +#meta_data['lineage_labels'].value_counts() + +meta_data['lineage'] = meta_data['lineage'].str.replace('lineage', 'L') + meta_data['id'].nunique() meta_data['sample'].nunique() meta_data['id'].equals(meta_data['sample']) @@ -268,7 +278,7 @@ print('===========================================================\n' , 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts() , '\n===========================================================\n' , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100 - , '\n===========================================================') + , '\n===========================================================') #%% Important sanity checks # This is to find out how many samples have 1 and more than 1 mutation,so you # can use it to check if your data extraction process for dr_muts @@ -523,7 +533,6 @@ del(out_filename_cid) # clear variables del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2) - #%% Extract gene specific nsSNPs: all nsSNPs i.e.'nssnp_match' print('Extracting nsSNP match:', gene, 'mutations from cols:\n' , dr_muts_col, 'and', other_muts_col, 'using string match:' @@ -911,9 +920,9 @@ ambiguous_muts_value_counts gene_LF1_orig = gene_LF1.copy() gene_LF1_orig.equals(gene_LF1) -# copy the old column for checking -gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info'] -gene_LF1['mutation_info_orig'].value_counts() +# copy the old columns for checking +gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info'] +gene_LF1['mutation_info_v1'] = gene_LF1['mutation_info'] gene_LF1['mutation_info'].value_counts() #===================================== @@ -956,22 +965,23 @@ changes_val changes_total = sum(changes_val) changes_dict -# TODO: Add sanity check to make sure you can add value_count checks - +#%%FIXME: TODO: Add sanity check to make sure you can add value_count checks +#%% Merging ambiguous muts #================= # Merge ambig muts # with gene_LF1 #=================== ambig_muts_rev_df.index +gene_LF1.index +all(ambig_muts_rev_df.index.isin(gene_LF1.index)) -gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV'] +gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV'] gene_LF1['mutation_info_orig'].value_counts() -gene_LF1['mutation_info'].value_counts() +gene_LF1['mutation_info_v1'].value_counts() foo = gene_LF1.iloc[ambig_muts_rev_df.index] -foo[['mutation', 'mutation_info', 'mutation_info_orig']] # Sanity check1: if there are still any ambiguous muts -muts_split_rev = list(gene_LF1.groupby('mutation_info')) +muts_split_rev = list(gene_LF1.groupby('mutation_info_v1')) dr_muts_rev = muts_split_rev[0][1].mutation other_muts_rev = muts_split_rev[1][1].mutation print('splitting muts by mut_info:', muts_split_rev) @@ -981,7 +991,7 @@ print('no. of other_muts samples', len(other_muts_rev)) if not dr_muts_rev.isin(other_muts_rev).sum() & other_muts_rev.isin(dr_muts_rev).sum() > 0: print('\nAmbiguous muts corrected. Proceeding with downstream analysis') else: - print('\nAmbiguous muts corrected. Quitting!') + print('\nAmbiguous muts NOT corrected. Quitting!') sys.exit() #%% OUTFILE 3, write file: ambiguous muts and ambiguous mut counts @@ -1543,8 +1553,12 @@ gene_LF2['index_orig'] = gene_LF2.index # need it for setting back later #%% FIXME: Add sanity check #gene_LF2['index_orig'].equals(gene_LF2.index) #%% Set index: 'mutationinformation' for adding multimode -gene_LF3 = gene_LF2.set_index(['mutationinformation']) +gene_LF2['Mut'] = gene_LF2['mutationinformation'] +#%% Further mappings: gene_LF3 +gene_LF3 = gene_LF2.set_index(['Mut']) gene_LF3.index +#gene_LF3 = gene_LF2.set_index(['mutationinformation']) + gene_LF3['dst_multimode'].value_counts() gene_LF3['dst_multimode'].value_counts().sum() #%% Multimode: dst @@ -1640,6 +1654,15 @@ gene_LF3['mutation_info_labels_orig'].value_counts().sum() gene_LF3['mutation_info_labels'].value_counts() gene_LF3['mutation_info_labels'].value_counts().sum() + +gene_LF3['mutation_info_labels'].value_counts() +gene_LF3['mutation_info'].value_counts() +gene_LF3['mutation_info_orig'].value_counts() +gene_LF3['mutation_info_orig'].value_counts().sum() + +gene_LF3['mutation_info_v1'].value_counts() +gene_LF3['mutation_info_v1'].value_counts().sum() + #%% TEST muts test_muts = ['G132A', 'V180F', 'G108R', 'A102P'] test_muts = ['L4S', 'L4W', 'A102P'] @@ -1694,17 +1717,8 @@ foo_copy.loc[foo2.index, 'lineage_ucount'] = foo2['lineage_corrupt_ucount'] #-------------------------- # lineage multimode mode #-------------------------- -lineage_label_map = {'lineage1' : 'L1' - , 'lineage2' : 'L2' - , 'lineage3' : 'L3' - , 'lineage4' : 'L4' - , 'lineage5' : 'L5' - , 'lineage6' : 'L6' - , 'lineage7' : 'L7' - , 'lineageBOV' : 'LBOV'} - -foo['lineage'].value_counts() -foo_updated = foo.replace(to_replace ='lineage', value = 'L', regex = True) # works +foo['lineage_labels'] = foo['lineage_labels'].str.replace('lineage', 'L') +# foo_updated = foo.replace(to_replace ='lineage', value = 'L', regex = True) # doesn't specify a column foo['lineage_labels'] = foo['lineage'] #df['team'] = df['team'].apply(lambda x: re.sub(r'[\n\r]*','', str(x))) @@ -1727,8 +1741,8 @@ foo2['lineage_numeric'].value_counts() foo2['lineage_numeric_list'] = foo2['mutationinformation'].map(foo2.groupby('mutationinformation')['lineage_numeric'].apply(list)) foo2['lineage_numeric_list'] - foo2['lineage_multimode'] = foo2.groupby(['mutationinformation'])['lineage_numeric'].agg(multimode) + c2 = foo2[foo2.loc[:, 'MUT'].isin(['A102P'])] c2['lineage_numeric'].value_counts() @@ -1743,36 +1757,160 @@ gene_LF3['lineage'].value_counts() # Create a column: lineage_corrupt gene_LF3['lineage_corrupt'] = gene_LF3['lineage'] +############################## +# CHECK may be you only need it for multimode merge +# set index +# gene_LF3['index_orig_copy'] = gene_LF3['index_orig'] +# gene_LF3['index_orig_copy'].head + +# gene_LF3.index +# gene_LF3 = gene_LF3.set_index(['index_orig_copy']) +# gene_LF3.index +################################ + # Create df with tidy_split: lineage lf_lin_split = tidy_split(gene_LF3, 'lineage_corrupt', sep = ';') lf_lin_split['lineage_corrupt'] = lf_lin_split['lineage_corrupt'].str.strip() + +# Add all lineages for each mutation lf_lin_split['lineage_corrupt_list'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_corrupt'].apply(list)) + +# Add lineage unique count lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_corrupt'].nunique()) -#---------------------------------- +# Add lineage_set +lf_lin_split['lineage_set'] = lf_lin_split['lineage_corrupt_list'].apply(lambda x : set(list(x))) +lf_lin_split['lineage_ulist'] = lf_lin_split['lineage_set'].apply(lambda x : list(x)) + +#-------------- +# Multimode lineage +# ONLY after split else the corrupt ones don't get mapped +#-------------- +# Do this mapping after tidy split else the ones with the +lineage_label_numeric = {'L1' : 1 + , 'L2' : 2 + , 'L3' : 3 + , 'L4' : 4 + , 'L5' : 5 + , 'L6' : 6 + , 'L7' : 7 + , 'LBOV' : 8} + +# lineage_numeric = {'lineage1' : 1 +# , 'lineage2' : 2 +# , 'lineage3' : 3 +# , 'lineage4' : 4 +# , 'lineage5' : 5 +# , 'lineage6' : 6 +# , 'lineage7' : 7 +# , 'lineageBOV' : 8} + +lf_lin_split['lineage_corrupt'].value_counts() +lf_lin_split['lineage_numeric'] = lf_lin_split['lineage_corrupt'].map(lineage_label_numeric) +lf_lin_split['lineage_corrupt'].value_counts() +lf_lin_split['lineage_numeric'].value_counts() + +lf_lin_split['lineage_numeric_list'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformation')['lineage_numeric'].apply(list)) +lf_lin_split['lineage_numeric_list'] + +#------------------- +# change indices +# to Mut as Multimode allows direct mapping this way! +#------------------- +lf_lin_split['Mut'] = lf_lin_split['mutationinformation'] +lf_lin_split['Mut'] +lf_lin_split = lf_lin_split.set_index(['Mut']) +lf_lin_split.index + +lf_lin_split['lineage_multimode'] = lf_lin_split.groupby(['mutationinformation'])['lineage_numeric'].agg(multimode) +lf_lin_split['lineage_multimode'].value_counts() + +#------------------- +# Reset indices +# to index orig to allow merge with gene_LF3 +#------------------- +lf_lin_split.index +lf_lin_split['index_orig_copy'] = lf_lin_split['index_orig'] +lf_lin_split = lf_lin_split.set_index(['index_orig_copy']) +lf_lin_split.index + +############################################################################### +#================================ # Merge with gene_LF3 with # lf_lin_split -#----------------------------------- +#================================ +# set index +gene_LF3['index_orig_copy'] = gene_LF3['index_orig'] +gene_LF3['index_orig_copy'].head + +gene_LF3.index +gene_LF3 = gene_LF3.set_index(['index_orig_copy']) +gene_LF3.index + +#------------------------- +# colum lineage_ucount: +# contribution of each distinct lineage +#------------------------- gene_LF3['lineage_ucount'] = gene_LF3['lineage'] -# quick checks -gene_LF3['lineage_ucount'].equals(gene_LF3['lineage']) +#------------------------- +# colum lineage list: +#------------------------- +#gene_LF3['lineage_set'] = gene_LF3['lineage'] +gene_LF3['lineage_ulist'] = gene_LF3['lineage'] + +gene_LF3['lineage_list'] = gene_LF3['lineage'] + +#------------------------- +# colum lineage_list mode: +#------------------------- +gene_LF3['lineage_mode'] = gene_LF3['lineage'] + +######################## # merge based on indices gene_LF3.index.nunique() lf_lin_split.index.nunique() all(gene_LF3.index.isin(lf_lin_split.index)) all(lf_lin_split.index.isin(gene_LF3.index)) +gene_LF3.index +lf_lin_split.index +if (gene_LF3.index.nunique() == lf_lin_split.index.nunique()) and ( all(gene_LF3.index.isin(lf_lin_split.index)) == all(lf_lin_split.index.isin(gene_LF3.index)) ): + print('\nPass: merging lineage_ucount with gene_LF3') +else: + print('\nFail: Indices mismatch, cannot merge! Quitting!') + sys.exit() + +########################### # magic merge happens here +########################### gene_LF3.loc[lf_lin_split.index, 'lineage_ucount'] = lf_lin_split['lineage_corrupt_ucount'] +gene_LF3['lineage_ucount'].value_counts() + +#gene_LF3.loc[lf_lin_split.index, 'lineage_set'] = lf_lin_split['lineage_set'] +#gene_LF3['lineage_set'].value_counts() +gene_LF3.loc[lf_lin_split.index, 'lineage_ulist'] = lf_lin_split['lineage_ulist'] +gene_LF3['lineage_ulist'].value_counts() + +gene_LF3.loc[lf_lin_split.index, 'lineage_list'] = lf_lin_split['lineage_corrupt_list'] +gene_LF3['lineage_list'].value_counts() + +gene_LF3.loc[lf_lin_split.index, 'lineage_mode'] = lf_lin_split['lineage_multimode'] +gene_LF3['lineage_mode'].value_counts() + +foo = gene_LF3[['mutationinformation', 'lineage', 'lineage_ucount' + #, 'lineage_set' + , 'lineage_ulist' + , 'lineage_mode' + , 'lineage_list']] #%% sanity checks check1 = gene_LF3[['mutationinformation', 'lineage', 'lineage_ucount']] check2 = check1[check1.loc[:, 'mutationinformation'].isin(['H57D'])] check2.value_counts() -#%% Reset index: original indices [WAS above section Revised counts] +#%% Reset index: original indices [WAS above ] #gene_LF3 = gene_LF3.reset_index() gene_LF3.index gene_LF3['mutationinformation'] = gene_LF3.index @@ -1780,5 +1918,14 @@ gene_LF3 = gene_LF3.set_index(['index_orig']) gene_LF3[['mutationinformation']] gene_LF3.index +#%% Remove MUT column not needed +# sanity check +if (all(gene_LF3['Mut'] == gene_LF3['mutationinformation'])): + print('\nPass: Mutationinformation check successful') +else: + sys.exit('\nERROR: mutationin cross checks failed. Please check your group_by() aggregate functions') + +# Drop mutation column +gene_LF3.drop(['MUT'], axis = 1, inplace = True) #%% ADD summary results #%% final output file with selected columns \ No newline at end of file