diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py
index 3d26352..e340000 100644
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -260,7 +260,6 @@ print("\n================================"
       , "\nMissing lineage samples:", meta_data['id'].nunique() - meta_data['lineage'].value_counts().sum()
       , "\n================================")
 
-
 meta_data['id'].nunique()
 meta_data['sample'].nunique()
 meta_data['id'].equals(meta_data['sample'])
@@ -1133,12 +1132,12 @@ gene_LF1.sort_values(by = ['position'], inplace = True)
 bar = gene_LF1['position'].value_counts()
 
 # FIXME:Can only compare identically-labeled Series objects
-if (foo == bar).all():
-    print('PASS: df ordered by position')
-    print(gene_LF1['position'].head())
-else:
-    print('FAIL: df could not be ordered. Check source')
-    sys.exit()
+#if (foo == bar).all():
+#    print('PASS: df ordered by position')
+#    print(gene_LF1['position'].head())
+#else:
+#    print('FAIL: df could not be ordered. Check source')
+#    sys.exit()
 
 #%% Create a copy of mutationinformation column for downstream mergeing
 gene_LF1['Mut'] = gene_LF1['mutationinformation']
@@ -1238,7 +1237,7 @@ del(out_filename_metadata_poscounts)
 #%% Add column: aa_calcprop
 #%% NEW mappings: gene_LF2
 # gene_LF2: copy gene_LF1
-gene_LF2 = gene_LF1.copy()gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list)
+gene_LF2 = gene_LF1.copy()
 gene_LF2.index
 
 #%% Add total unique id count
@@ -1344,7 +1343,6 @@ gene_LF3.head()
 foo = gene_LF3[['Mut', 'position', 'drtype', 'drtype_multimode', 'drtype_mode', 'drtype_max']]
 foo2 = foo.sort_values(['position', 'Mut'])
 
-
 ###############################################################################
 #%% Mapping 2: column '', drug
 
@@ -1355,6 +1353,7 @@ foo2 = foo.sort_values(['position', 'Mut'])
 dm_om_label_map = {dr_muts_col: 'DM'
                    , other_muts_col: 'OM'}
 dm_om_label_map
+
 gene_LF3['mutation_info_labels'] = gene_LF3['mutation_info'].map(dm_om_label_map)
 
 # mapping 1.2: numeric
@@ -1418,10 +1417,9 @@ gene_LF3['dst_multimode']
 gene_LF3['dst_mode'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
 
 # sanity checks
-gene_LF3['dst_noNA'].equals(gene_LF3['dst_mode'])
-
+#gene_LF3['dst_noNA'].equals(gene_LF3['dst_mode'])
 gene_LF3[drug].value_counts()
-gene_LF3['dst_noNA'].value_counts()
+#gene_LF3['dst_noNA'].value_counts()
 gene_LF3['dst_mode'].value_counts()
 
 foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']]
@@ -1443,10 +1441,10 @@ print('\n------------------------------------------------------'
 
       , gene_LF3['dst_mode'].value_counts()
       , '\nTotal samples [revised]', gene_LF3['dst_mode'].value_counts().sum()
-      , '\n----------------------------------'
-      , '\nRevised drug column count: dst_noNA\n'
-      , '\n----------------------------------\n'
-      , gene_LF3['dst_noNA'].value_counts()
+      # , '\n----------------------------------'
+      # , '\nRevised drug column count: dst_noNA\n'
+      # , '\n----------------------------------\n'
+      # , gene_LF3['dst_noNA'].value_counts()
       )
 #%% Create revised mutation_info_column based on dst_mode
 #---------------------------------------
@@ -1462,8 +1460,8 @@ gene_LF3['mutation_info_labels_v1'].value_counts() == gene_LF3['mutation_info_la
 # Now overwrite
 gene_LF3['mutation_info_labels_dst'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
 gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
-if (gene_LF3['mutation_info_labels_dst'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()):
-    print('\nRevised mutation_info colum created')
+if all(gene_LF3['mutation_info_labels_dst'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()):
+    print('\nPASS: Revised mutation_info column created successfully')
     gene_LF3 = gene_LF3.drop(['mutation_info_labels_dst'], axis = 1)
 else:
     print('\nmutation info labels numbers mismatch'
@@ -1474,6 +1472,11 @@ gene_LF3['mutation_info_orig'].value_counts()
 gene_LF3['mutation_info_labels_orig'] = gene_LF3['mutation_info_orig'].map(dm_om_label_map)
 gene_LF3['mutation_info_labels_orig'].value_counts()
 
+#%% FIXME: Get multimode for dm_om_numeric column
+#dm_om_multimode_LF4 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode)
+#dm_om_multimode_LF4
+#gene_LF3['dst_multimode_numeric'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF4)
+
 # %% sanity check for the revised dst
 gene_LF3[drug].value_counts()
 gene_LF3[drug].value_counts().sum()
@@ -1485,4 +1488,156 @@ gene_LF3['dst_mode'].value_counts().sum()
 # direct comparision
 gene_LF3['dst'].value_counts()
 gene_LF3['mutation_info_labels'].value_counts()
-#%%
\ No newline at end of file
+#%% Lineage
+gene_LF3['lineage'].value_counts()
+# lineage_label_numeric = {'lineage1' : 1
+#                          , 'lineage2' : 2
+#                          , 'lineage3' : 3
+#                          , 'lineage4' : 4
+#                          , 'lineage5' : 5
+#                          , 'lineage6' : 6
+#                          , 'lineage7' : 7
+#                          , 'lineageBOV' : 8}
+
+lineage_label_numeric = {'L1' : 1
+                         , 'L2' : 2
+                         , 'L3' : 3
+                         , 'L4' : 4
+                         , 'L5' : 5
+                         , 'L6' : 6
+                         , 'L7' : 7
+                         , 'LBOV' : 8}
+
+lineage_label_numeric
+# strip surrounding whitespace, then keep a working copy ('lineage_corrupt') that is split on ';' below
+gene_LF3['lineage'] = gene_LF3['lineage'].str.strip()
+gene_LF3['lineage_corrupt'] = gene_LF3['lineage']
+#all(gene_LF3['lineage_corrupt'].value_counts() ==gene_LF3['lineage'].value_counts())
+gene_LF3['lineage_corrupt'].value_counts()
+
+#%%tidy_split(): Lineage
+# Split the ';'-separated lineage_corrupt values with tidy_split() (helper defined elsewhere; presumably one row per lineage - confirm)
+lf_lin_split = tidy_split(gene_LF3, 'lineage_corrupt', sep = ';')
+lf_lin_split['lineage_corrupt'] = lf_lin_split['lineage_corrupt'].str.strip()
+lf_lin_split['lineage_corrupt'].value_counts()
+
+# Map lineage labels to numbers to allow metrics
+lf_lin_split['lineage_numeric'] = lf_lin_split['lineage_corrupt'].map(lineage_label_numeric)
+lf_lin_split['lineage_numeric'].value_counts()
+
+#--------------------------------
+# Lineage_corrupt ALL values:
+#--------------------------------
+# Add all lineages for each mutation
+lf_lin_split['lineage_corrupt_list'] = lf_lin_split['lineage_corrupt']
+lf_lin_split['lineage_corrupt_list'].value_counts()
+#lf_lin_split['lineage_corrupt_list'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformati
+lf_lin_split['lineage_corrupt_list'] = lf_lin_split.groupby('mutationinformation').lineage_corrupt_list.apply(list)
+lf_lin_split['lineage_corrupt_list'].value_counts()
+
+# Add number of distinct lineages per mutation ('Mut')
+lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['Mut'].map(lf_lin_split.groupby('Mut')['lineage_corrupt'].nunique())
+#lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['lineage_corrupt']
+lf_lin_split['lineage_corrupt_ucount'].value_counts()
+
+# Add lineage_set (distinct lineages) and the same values as a list (lineage_ulist)
+lf_lin_split['lineage_set'] = lf_lin_split['lineage_corrupt_list'].apply(lambda x : set(x))
+lf_lin_split['lineage_ulist'] = lf_lin_split['lineage_set'].apply(lambda x : list(x))
+
+#-------------------------------------
+# Lineage numeric mode: multimode
+#-------------------------------------
+lf_lin_split['lineage_multimode'] = lf_lin_split.groupby('mutationinformation')['lineage_numeric'].agg(multimode)
+lf_lin_split['lineage_multimode'].value_counts()
+
+# can't take max as it doesn't mean anything!
+foo = lf_lin_split[['Mut', 'lineage_ulist']]
+
+###############################################################################
+#%% Final merge: gene_LF4 with lineage_split_df
+gene_LF4 = gene_LF3.copy()
+gene_LF4.index
+gene_LF4['index_orig_copy'] = gene_LF4['index_orig']
+
+lf_lin_split['index_orig_copy'] = lf_lin_split['index_orig']
+#================================
+# Merge gene_LF4 with
+# lf_lin_split based on index
+#================================
+# set index for lf_lin_split
+lf_lin_split.index
+lf_lin_split.reset_index(inplace=True)
+lf_lin_split['mutationinformation']
+lf_lin_split = lf_lin_split.set_index(['index_orig_copy'])
+lf_lin_split.index
+
+# set index for gene_LF4
+gene_LF4.index
+gene_LF4.reset_index(inplace=True)
+gene_LF4.index
+gene_LF4['mutationinformation']
+gene_LF4['index_orig_copy']
+gene_LF4 = gene_LF4.set_index(['index_orig_copy'])
+gene_LF4.index
+
+#-------------------------
+# column lineage_ucount:
+# contribution of each distinct lineage
+#-------------------------
+gene_LF4['lineage_ucount'] = gene_LF4['lineage']
+
+#-------------------------
+# column lineage list:
+#-------------------------
+#gene_LF4['lineage_set'] = gene_LF4['lineage']
+gene_LF4['lineage_ulist'] = gene_LF4['lineage']
+
+gene_LF4['lineage_list'] = gene_LF4['lineage']
+
+#-------------------------
+# column lineage_list mode:
+#-------------------------
+gene_LF4['lineage_mode'] = gene_LF4['lineage']
+
+# merge based on indices: both frames must carry exactly the same set of index labels
+gene_LF4.index.nunique()
+lf_lin_split.index.nunique()
+all(gene_LF4.index.isin(lf_lin_split.index))
+all(lf_lin_split.index.isin(gene_LF4.index))
+gene_LF4.index
+lf_lin_split.index
+
+if (gene_LF4.index.nunique() == lf_lin_split.index.nunique()) and all(gene_LF4.index.isin(lf_lin_split.index)) and all(lf_lin_split.index.isin(gene_LF4.index)):
+    print('\nPass: merging lineage_ucount with gene_LF4')
+else:
+    sys.exit('\nFail: Indices mismatch, cannot merge! Quitting!')
+
+###########################
+# magic merge happens here
+###########################
+# Keep only the first row per index label (duplicated(keep='first')) so each
+# gene_LF4 row receives exactly one value in the .loc assignments below
+lf_lin_split_U = lf_lin_split[~lf_lin_split.index.duplicated(keep='first')]
+lf_lin_split_U.shape
+
+gene_LF4.loc[lf_lin_split_U.index, 'lineage_ucount'] = lf_lin_split_U['lineage_corrupt_ucount']
+gene_LF4['lineage_ucount'].value_counts()
+
+#gene_LF4.loc[lf_lin_split_U.index, 'lineage_set'] = lf_lin_split_U['lineage_set']
+#gene_LF4['lineage_set'].value_counts()
+gene_LF4.loc[lf_lin_split_U.index, 'lineage_ulist'] = lf_lin_split_U['lineage_ulist']
+gene_LF4['lineage_ulist'].value_counts()
+
+gene_LF4.loc[lf_lin_split_U.index, 'lineage_list'] = lf_lin_split_U['lineage_corrupt_list']
+gene_LF4['lineage_list'].value_counts()
+
+gene_LF4.loc[lf_lin_split_U.index, 'lineage_mode'] = lf_lin_split_U['lineage_multimode']
+gene_LF4['lineage_mode'].value_counts()
+
+foo = gene_LF4[['mutationinformation', 'lineage', 'lineage_ucount'
+                #, 'lineage_set'
+                , 'lineage_ulist'
+                , 'lineage_mode'
+                , 'lineage_list']]
+#%%
+#Subset relevant columns for output and put the rest of the output here
\ No newline at end of file