From e54ae877a80d8ec168111c608dbe14b13f814ff2 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 5 May 2022 13:35:24 +0100 Subject: [PATCH] updated data extraction to ensure genes without common mutations and duplicate indices can run from the cmd --- scripts/data_extraction.py | 312 ++++++++++++++++++++----------------- 1 file changed, 170 insertions(+), 142 deletions(-) diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index bc418d1..c610489 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -376,7 +376,7 @@ print('===========================================================\n' search = ";" # count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurence -count_df_dr = meta_gene_dr[['id', dr_muts_col]] +count_df_dr = meta_gene_dr[['id', dr_muts_col]].copy() count_df_dr['dr_semicolon_count'] = meta_gene_dr.loc[:, dr_muts_col].str.count(search, re.I) dr_sc_C = count_df_dr['dr_semicolon_count'].value_counts().reset_index() dr_sc_C @@ -502,7 +502,7 @@ if other_muts_col in dr_df.columns: #%% Important sanity checks: other muts column for tidy split(), nsSNPs, etc. # Split based on semi colon on other_muts_col # count of occurrence of ";" in other_muts_col: No.of semicolons + 1 is no. of rows created * occurence -count_df_other = meta_gene_other[['id', other_muts_col]] +count_df_other = meta_gene_other[['id', other_muts_col]].copy() count_df_other['other_semicolon_count'] = meta_gene_other.loc[:, other_muts_col].str.count(search, re.I) other_sc_C = count_df_other['other_semicolon_count'].value_counts().reset_index() other_sc_C @@ -790,7 +790,8 @@ print('Length of gene_LF0:', len(gene_LF0) , '\nThis should be what we need. But just double checking and extracting nsSNP for', gene , '\nfrom LF0 (concatenated data) using case insensitive regex match:', nssnp_match) -gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)] +#gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)] +gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)].copy() if len(gene_LF0) == len(gene_LF1): print('PASS: length of gene_LF0 and gene_LF1 match', @@ -859,8 +860,7 @@ other_muts = muts_split[1][1].mutation print('splitting muts by mut_info:', muts_split) print('no.of dr_muts samples:', len(dr_muts)) print('no. of other_muts samples', len(other_muts)) -#%% Ambiguous muts -# IMPORTANT: The same mutation cannot be classed as a drug AND 'others' +#%% Ambiguous muts: the same mutation cannot be classed as a drug AND 'others' if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category' , '\n===============================================================') @@ -894,139 +894,166 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: print('\n===========================================================') else: #sys.exit('Error: ambiguous muts present, but extraction failed. Debug!') - print('No: ambiguous muts present') - -#%% Ambiguous muts: revised annotation for mutation_info -ambiguous_muts_df = gene_LF1[gene_LF1['mutation'].isin(common_muts)] -ambiguous_muts_value_counts = ambiguous_muts_df.groupby('mutation')['mutation_info'].value_counts() -ambiguous_muts_value_counts + print('Ambiguous muts are NOT present') +#%% DOES NOT depend on common_muts gene_LF1_orig = gene_LF1.copy() gene_LF1_orig.equals(gene_LF1) # copy the old columns for checking -gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info'] +gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info'] gene_LF1['mutation_info_v1'] = gene_LF1['mutation_info'] gene_LF1['mutation_info'].value_counts() -#%% Inspect ambiguous muts -#===================================== -# Now create a new df that will have: - # ambiguous muts - # mutation_info - # revised mutation_info -# The revised label is based on value_counts -# for mutaiton_info. The corresponding mutation_info: -# label is chosen that corresponds to the max of value counts -#===================================== -ambig_muts_rev_df = pd.DataFrame() -changes_val = [] -changes_dict = {} -##BROKENNNN!!!! -common_muts -gene_LF1['mutation'].head() -#common_muts_lower = list((map(lambda x: x.lower(), common_muts))) -#common_muts_lower - -for i in common_muts: -#for i in common_muts_lower: - #print(i) - temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']] - temp_df - # DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending - max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts() - max_info_table - revised_label = max_info_table[[0]].index[0][1] # max value_count - old_label = max_info_table[[1]].index[0][1] # min value_count - print('\nAmbiguous mutation labels...' - , '\nSetting mutation_info for', i, 'to', revised_label) - temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info_orig'] == old_label) - , revised_label - , temp_df['mutation_info_orig']) - ambig_muts_rev_df = pd.concat([ambig_muts_rev_df,temp_df]) - f = max_info_table[[1]] - old_label_count = f[[0]][0] - changes_val.append(old_label_count) - cc_dict = f.to_dict() - changes_dict.update(cc_dict) - -ambig_muts_rev_df['mutation_info_REV'].value_counts() -ambig_muts_rev_df['mutation_info_orig'].value_counts() -changes_val -changes_total = sum(changes_val) -changes_dict -#%% OUTFILE 1, write file: ambiguous muts and ambiguous mut counts -#================== -# ambiguous muts -#================== -#dr_muts.XXX_csvXXXX('dr_muts.csv', header = True) -#other_muts.XXXX_csvXXX('other_muts.csv', header = True) -if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: - out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv' - outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts - print('\n----------------------------------' - , '\nWriting file: ambiguous muts' - , '\n----------------------------------' - , '\nFilename:', outfile_ambig_muts) - inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)] - inspect.to_csv(outfile_ambig_muts, index = True) - - print('Finished writing:', out_filename_ambig_muts - , '\nNo. of rows:', len(inspect) - , '\nNo. of cols:', len(inspect.columns) - , '\nNo. of rows = no. of samples with the ambiguous muts present:' - , dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum() - , '\n=============================================================') - del(out_filename_ambig_muts) +#%% Ambiguous muts: revised annotation for mutation_info +if 'common_muts' in globals(): + ambiguous_muts_df = gene_LF1[gene_LF1['mutation'].isin(common_muts)] + ambiguous_muts_value_counts = ambiguous_muts_df.groupby('mutation')['mutation_info'].value_counts() + ambiguous_muts_value_counts -#%% OUTFILE 2, write file: ambiguous mut counts -#====================== -# ambiguous mut counts -#====================== -out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv' -outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts -print('\n----------------------------------' - , '\nWriting file: ambiguous mut counts' - , '\n----------------------------------' - , '\nFilename:', outfile_ambig_mut_counts) + # gene_LF1_orig = gene_LF1.copy() + # gene_LF1_orig.equals(gene_LF1) + + # # copy the old columns for checking + # gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info'] + # gene_LF1['mutation_info_v1'] = gene_LF1['mutation_info'] + # gene_LF1['mutation_info'].value_counts() + #%% Inspect ambiguous muts + #===================================== + # Now create a new df that will have: + # ambiguous muts + # mutation_info + # revised mutation_info + # The revised label is based on value_counts + # for mutaiton_info. The corresponding mutation_info: + # label is chosen that corresponds to the max of value counts + #===================================== + ambig_muts_rev_df = pd.DataFrame() + changes_val = [] + changes_dict = {} + + common_muts + gene_LF1['mutation'].head() + #common_muts_lower = list((map(lambda x: x.lower(), common_muts))) + #common_muts_lower + + for i in common_muts: + #for i in common_muts_lower: + #print(i) + temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']] + temp_df + # DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending + max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts() + max_info_table + revised_label = max_info_table[[0]].index[0][1] # max value_count + old_label = max_info_table[[1]].index[0][1] # min value_count + print('\nAmbiguous mutation labels...' + , '\nSetting mutation_info for', i, 'to', revised_label) + temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info_orig'] == old_label) + , revised_label + , temp_df['mutation_info_orig']) + ambig_muts_rev_df = pd.concat([ambig_muts_rev_df,temp_df]) + f = max_info_table[[1]] + old_label_count = f[[0]][0] + changes_val.append(old_label_count) + cc_dict = f.to_dict() + changes_dict.update(cc_dict) + + ambig_muts_rev_df['mutation_info_REV'].value_counts() + ambig_muts_rev_df['mutation_info_orig'].value_counts() + changes_val + changes_total = sum(changes_val) + changes_dict + n_changes = sum(changes_dict.values()) + #%% OUTFILE 1, write file: ambiguous muts and ambiguous mut counts + #================== + # ambiguous muts + #================== + #dr_muts.XXX_csvXXXX('dr_muts.csv', header = True) + #other_muts.XXXX_csvXXX('other_muts.csv', header = True) + if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: + out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv' + outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts + print('\n----------------------------------' + , '\nWriting file: ambiguous muts' + , '\n----------------------------------' + , '\nFilename:', outfile_ambig_muts) + inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)] + inspect.to_csv(outfile_ambig_muts, index = True) + + print('Finished writing:', out_filename_ambig_muts + , '\nNo. of rows:', len(inspect) + , '\nNo. of cols:', len(inspect.columns) + , '\nNo. of rows = no. of samples with the ambiguous muts present:' + , dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum() + , '\n=============================================================') + del(out_filename_ambig_muts) + + #%% OUTFILE 2, write file: ambiguous mut counts + #====================== + # ambiguous mut counts + #====================== + out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv' + outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts + print('\n----------------------------------' + , '\nWriting file: ambiguous mut counts' + , '\n----------------------------------' + , '\nFilename:', outfile_ambig_mut_counts) + + ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True) + #%% FIXME: Add sanity check to make sure you can add value_count checks + #%% Resolving ambiguous muts: Merging ambiguous muts with the revised annotations + #================= + # Merge ambig muts + # with gene_LF1 + #=================== + ambig_muts_rev_df.index + gene_LF1.index + all(ambig_muts_rev_df.index.isin(gene_LF1.index)) + + any(gene_LF1.index.isin(ambig_muts_rev_df.index)) + # if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == len(ambig_muts_rev_df)): + if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == ambig_muts_rev_df.index.nunique()): -ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True) -#%% FIXME: Add sanity check to make sure you can add value_count checks -#%% Resolving ambiguous muts: Merging ambiguous muts with the revised annotations -#================= -# Merge ambig muts -# with gene_LF1 -#=================== -ambig_muts_rev_df.index -gene_LF1.index -all(ambig_muts_rev_df.index.isin(gene_LF1.index)) + print('\nPASS: ambiguous mut indices present in gene_LF1. Prepare to merge...') + else: + sys.exit('\nFAIL:ambiguous mut indices MISmatch. Check section Resolving ambiguous muts') + + ########################################################################## + for index, row in ambig_muts_rev_df.iterrows(): + curr_mut = row['mutation'] + curr_rev = row['mutation_info_REV'] + print('\n=====\nAmbiguous Mutation: index:', index, '\nmutation:', curr_mut, '\nNew:', curr_rev, '\n=====\n' ) + print('\n-----\nReplacing original: index:', index, '\nmutation: ' + , gene_LF1.loc[index,'mutation'] + , '\nmutation_info to replace:' + , gene_LF1.loc[index,'mutation_info'] + , '\nwith:', curr_rev, '\n-----') + replacement_row=(gene_LF1.index==index) & (gene_LF1['mutation'] == curr_mut) + gene_LF1.loc[replacement_row, 'mutation_info'] = curr_rev + + ########################################################################### + + gene_LF1['mutation_info_orig'].value_counts() + gene_LF1['mutation_info_v1'].value_counts() + gene_LF1['mutation_info'].value_counts() -any(gene_LF1.index.isin(ambig_muts_rev_df.index)) -if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == len(ambig_muts_rev_df)): - print('\nPASS: ambiguous mut indices present in gene_LF1. Prepare to merge...') + # Sanity check1: if there are still any ambiguous muts + #muts_split_rev = list(gene_LF1.groupby('mutation_info_v1')) + muts_split_rev = list(gene_LF1.groupby('mutation_info')) + dr_muts_rev = muts_split_rev[0][1].mutation + other_muts_rev = muts_split_rev[1][1].mutation + print('splitting muts by mut_info:', muts_split_rev) + print('no.of dr_muts samples:', len(dr_muts_rev)) + print('no. of other_muts samples', len(other_muts_rev)) + + if not dr_muts_rev.isin(other_muts_rev).sum() & other_muts_rev.isin(dr_muts_rev).sum() > 0: + print('\nAmbiguous muts corrected. Proceeding with downstream analysis') + else: + print('\nAmbiguous muts NOT corrected. Quitting!') + sys.exit() else: - sys.exit('\nFAIL:ambiguous mut indices MISmatch. Check section Resolving ambiguous muts') - -#gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV'] -gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV'] - -gene_LF1['mutation_info_orig'].value_counts() -gene_LF1['mutation_info_v1'].value_counts() - -# Sanity check1: if there are still any ambiguous muts -#muts_split_rev = list(gene_LF1.groupby('mutation_info_v1')) -muts_split_rev = list(gene_LF1.groupby('mutation_info')) -dr_muts_rev = muts_split_rev[0][1].mutation -other_muts_rev = muts_split_rev[1][1].mutation -print('splitting muts by mut_info:', muts_split_rev) -print('no.of dr_muts samples:', len(dr_muts_rev)) -print('no. of other_muts samples', len(other_muts_rev)) - -if not dr_muts_rev.isin(other_muts_rev).sum() & other_muts_rev.isin(dr_muts_rev).sum() > 0: - print('\nAmbiguous muts corrected. Proceeding with downstream analysis') -else: - print('\nAmbiguous muts NOT corrected. Quitting!') - sys.exit() + print('Mutations ARE NOT ambiguous, proceeding to downstream analyses') #gene_LF1['mutation_info_v1'].value_counts() gene_LF1['mutation_info'].value_counts() @@ -1649,7 +1676,7 @@ lf_lin_split['lineage_corrupt'].value_counts() search = ";" # count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurence -count_df_lin = gene_LF3_ColsSel[['lineage']] +count_df_lin = gene_LF3_ColsSel[['lineage']].copy() count_df_lin['lineage_semicolon_count'] = gene_LF3_ColsSel.loc[:, 'lineage'].str.count(search, re.I) lin_sc_C = count_df_lin['lineage_semicolon_count'].value_counts().reset_index() lin_sc_C @@ -1708,7 +1735,7 @@ lf_lin_split['lineage_multimode'].value_counts() #%% Select only the columns you want to merge from lf_lin_split lf_lin_split.columns lf_lin_split_ColSel = lf_lin_split[['lineage_corrupt_list','lineage_corrupt_count' - , 'lineage_corrupt_ucount' ,'lineage_ulist', 'lineage_multimode']] + , 'lineage_corrupt_ucount' ,'lineage_ulist', 'lineage_multimode']].copy() lf_lin_split_ColSel.columns lf_lin_split_ColSel.rename(columns = {'lineage_corrupt_list' : 'lineage_list_all' @@ -1951,28 +1978,29 @@ print('\n============================================' , '\n' , '\nTotal no. of unique nsSNPs [check1: length of snps_only]:', len(snps_only) - - , '\nTotal no.of unique dr muts:' , dr_muts_rev.nunique() - , '\nTotal no.of unique other muts:' , other_muts_rev.nunique() - , '\nTotal no. of unique nsSNPs [check2: dr_muts + other_muts]:', dr_muts_rev.nunique()+other_muts_rev.nunique() - , '\nTotal no.of unique nSNSPs [check3, gene_LF4]:', gene_LF4['mutationinformation'].nunique() , '\nTotal no.of unique positions associated with missense muts:', gene_LF4['position'].nunique() , '\nTotal no. of samples with nsSNPs:', len(gene_LF4) , '\nTotal no. of unique sample ids with nsSNPs:', gene_LF4['id'].nunique() ) - -if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: - print('\nTotal no.of samples with ambiguous muts:', len(inspect) - #, '\nTotal no.of unique ambiguous muts:', len(common_muts) - , '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique() - , '\n=============================================================' - , '\nPost resolving ambiguity\n' - , ambig_muts_rev_df['mutation_info_REV'].value_counts()) +if 'common_muts' in globals(): + print('\nTotal no.of unique dr muts:' , dr_muts_rev.nunique() + , '\nTotal no.of unique other muts:' , other_muts_rev.nunique() + , '\nTotal no. of unique nsSNPs [check2: dr_muts + other_muts]:', dr_muts_rev.nunique()+other_muts_rev.nunique() + ) + if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: + print('\nTotal no.of samples with ambiguous muts:', len(inspect) + #, '\nTotal no.of unique ambiguous muts:', len(common_muts) + , '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique() + , '\n=============================================================' + , '\nPost resolving ambiguity\n' + , ambig_muts_rev_df['mutation_info_REV'].value_counts()) +else: + print('No ambiguous muts present, hence no summary') print('\n=============================================================' , '\n=============================================================' - , '\###############################\n' + , '\n###############################\n' , '\nNumbers for ML workflows...' , '\n###############################\n'