Updated data extraction so that genes without common mutations, and data with duplicate indices, can be run from the command line

This commit is contained in:
Tanushree Tunstall 2022-05-05 13:35:24 +01:00
parent 5429b8fed7
commit e54ae877a8

View file

@ -376,7 +376,7 @@ print('===========================================================\n'
search = ";" search = ";"
# count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurrence # count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurrence
count_df_dr = meta_gene_dr[['id', dr_muts_col]] count_df_dr = meta_gene_dr[['id', dr_muts_col]].copy()
count_df_dr['dr_semicolon_count'] = meta_gene_dr.loc[:, dr_muts_col].str.count(search, re.I) count_df_dr['dr_semicolon_count'] = meta_gene_dr.loc[:, dr_muts_col].str.count(search, re.I)
dr_sc_C = count_df_dr['dr_semicolon_count'].value_counts().reset_index() dr_sc_C = count_df_dr['dr_semicolon_count'].value_counts().reset_index()
dr_sc_C dr_sc_C
@ -502,7 +502,7 @@ if other_muts_col in dr_df.columns:
#%% Important sanity checks: other muts column for tidy split(), nsSNPs, etc. #%% Important sanity checks: other muts column for tidy split(), nsSNPs, etc.
# Split based on semi colon on other_muts_col # Split based on semi colon on other_muts_col
# count of occurrence of ";" in other_muts_col: No.of semicolons + 1 is no. of rows created * occurrence # count of occurrence of ";" in other_muts_col: No.of semicolons + 1 is no. of rows created * occurrence
count_df_other = meta_gene_other[['id', other_muts_col]] count_df_other = meta_gene_other[['id', other_muts_col]].copy()
count_df_other['other_semicolon_count'] = meta_gene_other.loc[:, other_muts_col].str.count(search, re.I) count_df_other['other_semicolon_count'] = meta_gene_other.loc[:, other_muts_col].str.count(search, re.I)
other_sc_C = count_df_other['other_semicolon_count'].value_counts().reset_index() other_sc_C = count_df_other['other_semicolon_count'].value_counts().reset_index()
other_sc_C other_sc_C
@ -790,7 +790,8 @@ print('Length of gene_LF0:', len(gene_LF0)
, '\nThis should be what we need. But just double checking and extracting nsSNP for', gene , '\nThis should be what we need. But just double checking and extracting nsSNP for', gene
, '\nfrom LF0 (concatenated data) using case insensitive regex match:', nssnp_match) , '\nfrom LF0 (concatenated data) using case insensitive regex match:', nssnp_match)
gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)] #gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)]
gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)].copy()
if len(gene_LF0) == len(gene_LF1): if len(gene_LF0) == len(gene_LF1):
print('PASS: length of gene_LF0 and gene_LF1 match', print('PASS: length of gene_LF0 and gene_LF1 match',
@ -859,8 +860,7 @@ other_muts = muts_split[1][1].mutation
print('splitting muts by mut_info:', muts_split) print('splitting muts by mut_info:', muts_split)
print('no.of dr_muts samples:', len(dr_muts)) print('no.of dr_muts samples:', len(dr_muts))
print('no. of other_muts samples', len(other_muts)) print('no. of other_muts samples', len(other_muts))
#%% Ambiguous muts #%% Ambiguous muts: the same mutation cannot be classed as a drug AND 'others'
# IMPORTANT: The same mutation cannot be classed as a drug AND 'others'
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category' print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category'
, '\n===============================================================') , '\n===============================================================')
@ -894,13 +894,9 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
print('\n===========================================================') print('\n===========================================================')
else: else:
#sys.exit('Error: ambiguous muts present, but extraction failed. Debug!') #sys.exit('Error: ambiguous muts present, but extraction failed. Debug!')
print('No: ambiguous muts present') print('Ambiguous muts are NOT present')
#%% Ambiguous muts: revised annotation for mutation_info
ambiguous_muts_df = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
ambiguous_muts_value_counts = ambiguous_muts_df.groupby('mutation')['mutation_info'].value_counts()
ambiguous_muts_value_counts
#%% DOES NOT depend on common_muts
gene_LF1_orig = gene_LF1.copy() gene_LF1_orig = gene_LF1.copy()
gene_LF1_orig.equals(gene_LF1) gene_LF1_orig.equals(gene_LF1)
@ -908,28 +904,41 @@ gene_LF1_orig.equals(gene_LF1)
gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info'] gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info']
gene_LF1['mutation_info_v1'] = gene_LF1['mutation_info'] gene_LF1['mutation_info_v1'] = gene_LF1['mutation_info']
gene_LF1['mutation_info'].value_counts() gene_LF1['mutation_info'].value_counts()
#%% Inspect ambiguous muts
#===================================== #%% Ambiguous muts: revised annotation for mutation_info
# Now create a new df that will have: if 'common_muts' in globals():
ambiguous_muts_df = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
ambiguous_muts_value_counts = ambiguous_muts_df.groupby('mutation')['mutation_info'].value_counts()
ambiguous_muts_value_counts
# gene_LF1_orig = gene_LF1.copy()
# gene_LF1_orig.equals(gene_LF1)
# # copy the old columns for checking
# gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info']
# gene_LF1['mutation_info_v1'] = gene_LF1['mutation_info']
# gene_LF1['mutation_info'].value_counts()
#%% Inspect ambiguous muts
#=====================================
# Now create a new df that will have:
# ambiguous muts # ambiguous muts
# mutation_info # mutation_info
# revised mutation_info # revised mutation_info
# The revised label is based on value_counts # The revised label is based on value_counts
# for mutation_info. The corresponding mutation_info: # for mutation_info. The corresponding mutation_info:
# label is chosen that corresponds to the max of value counts # label is chosen that corresponds to the max of value counts
#===================================== #=====================================
ambig_muts_rev_df = pd.DataFrame() ambig_muts_rev_df = pd.DataFrame()
changes_val = [] changes_val = []
changes_dict = {} changes_dict = {}
##BROKENNNN!!!! common_muts
common_muts gene_LF1['mutation'].head()
gene_LF1['mutation'].head() #common_muts_lower = list((map(lambda x: x.lower(), common_muts)))
#common_muts_lower = list((map(lambda x: x.lower(), common_muts))) #common_muts_lower
#common_muts_lower
for i in common_muts: for i in common_muts:
#for i in common_muts_lower: #for i in common_muts_lower:
#print(i) #print(i)
temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']] temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']]
temp_df temp_df
@ -950,18 +959,19 @@ for i in common_muts:
cc_dict = f.to_dict() cc_dict = f.to_dict()
changes_dict.update(cc_dict) changes_dict.update(cc_dict)
ambig_muts_rev_df['mutation_info_REV'].value_counts() ambig_muts_rev_df['mutation_info_REV'].value_counts()
ambig_muts_rev_df['mutation_info_orig'].value_counts() ambig_muts_rev_df['mutation_info_orig'].value_counts()
changes_val changes_val
changes_total = sum(changes_val) changes_total = sum(changes_val)
changes_dict changes_dict
#%% OUTFILE 1, write file: ambiguous muts and ambiguous mut counts n_changes = sum(changes_dict.values())
#================== #%% OUTFILE 1, write file: ambiguous muts and ambiguous mut counts
# ambiguous muts #==================
#================== # ambiguous muts
#dr_muts.XXX_csvXXXX('dr_muts.csv', header = True) #==================
#other_muts.XXXX_csvXXX('other_muts.csv', header = True) #dr_muts.XXX_csvXXXX('dr_muts.csv', header = True)
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: #other_muts.XXXX_csvXXX('other_muts.csv', header = True)
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv' out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
print('\n----------------------------------' print('\n----------------------------------'
@ -979,54 +989,71 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
, '\n=============================================================') , '\n=============================================================')
del(out_filename_ambig_muts) del(out_filename_ambig_muts)
#%% OUTFILE 2, write file: ambiguous mut counts #%% OUTFILE 2, write file: ambiguous mut counts
#====================== #======================
# ambiguous mut counts # ambiguous mut counts
#====================== #======================
out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv' out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv'
outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts
print('\n----------------------------------' print('\n----------------------------------'
, '\nWriting file: ambiguous mut counts' , '\nWriting file: ambiguous mut counts'
, '\n----------------------------------' , '\n----------------------------------'
, '\nFilename:', outfile_ambig_mut_counts) , '\nFilename:', outfile_ambig_mut_counts)
ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True) ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)
#%% FIXME: Add sanity check to make sure you can add value_count checks #%% FIXME: Add sanity check to make sure you can add value_count checks
#%% Resolving ambiguous muts: Merging ambiguous muts with the revised annotations #%% Resolving ambiguous muts: Merging ambiguous muts with the revised annotations
#================= #=================
# Merge ambig muts # Merge ambig muts
# with gene_LF1 # with gene_LF1
#=================== #===================
ambig_muts_rev_df.index ambig_muts_rev_df.index
gene_LF1.index gene_LF1.index
all(ambig_muts_rev_df.index.isin(gene_LF1.index)) all(ambig_muts_rev_df.index.isin(gene_LF1.index))
any(gene_LF1.index.isin(ambig_muts_rev_df.index))
# if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == len(ambig_muts_rev_df)):
if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == ambig_muts_rev_df.index.nunique()):
any(gene_LF1.index.isin(ambig_muts_rev_df.index))
if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == len(ambig_muts_rev_df)):
print('\nPASS: ambiguous mut indices present in gene_LF1. Prepare to merge...') print('\nPASS: ambiguous mut indices present in gene_LF1. Prepare to merge...')
else: else:
sys.exit('\nFAIL:ambiguous mut indices MISmatch. Check section Resolving ambiguous muts') sys.exit('\nFAIL:ambiguous mut indices MISmatch. Check section Resolving ambiguous muts')
#gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV'] ##########################################################################
gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV'] for index, row in ambig_muts_rev_df.iterrows():
curr_mut = row['mutation']
curr_rev = row['mutation_info_REV']
print('\n=====\nAmbiguous Mutation: index:', index, '\nmutation:', curr_mut, '\nNew:', curr_rev, '\n=====\n' )
print('\n-----\nReplacing original: index:', index, '\nmutation: '
, gene_LF1.loc[index,'mutation']
, '\nmutation_info to replace:'
, gene_LF1.loc[index,'mutation_info']
, '\nwith:', curr_rev, '\n-----')
replacement_row=(gene_LF1.index==index) & (gene_LF1['mutation'] == curr_mut)
gene_LF1.loc[replacement_row, 'mutation_info'] = curr_rev
gene_LF1['mutation_info_orig'].value_counts() ###########################################################################
gene_LF1['mutation_info_v1'].value_counts()
# Sanity check1: if there are still any ambiguous muts gene_LF1['mutation_info_orig'].value_counts()
#muts_split_rev = list(gene_LF1.groupby('mutation_info_v1')) gene_LF1['mutation_info_v1'].value_counts()
muts_split_rev = list(gene_LF1.groupby('mutation_info')) gene_LF1['mutation_info'].value_counts()
dr_muts_rev = muts_split_rev[0][1].mutation
other_muts_rev = muts_split_rev[1][1].mutation
print('splitting muts by mut_info:', muts_split_rev)
print('no.of dr_muts samples:', len(dr_muts_rev))
print('no. of other_muts samples', len(other_muts_rev))
if not dr_muts_rev.isin(other_muts_rev).sum() & other_muts_rev.isin(dr_muts_rev).sum() > 0: # Sanity check1: if there are still any ambiguous muts
#muts_split_rev = list(gene_LF1.groupby('mutation_info_v1'))
muts_split_rev = list(gene_LF1.groupby('mutation_info'))
dr_muts_rev = muts_split_rev[0][1].mutation
other_muts_rev = muts_split_rev[1][1].mutation
print('splitting muts by mut_info:', muts_split_rev)
print('no.of dr_muts samples:', len(dr_muts_rev))
print('no. of other_muts samples', len(other_muts_rev))
if not dr_muts_rev.isin(other_muts_rev).sum() & other_muts_rev.isin(dr_muts_rev).sum() > 0:
print('\nAmbiguous muts corrected. Proceeding with downstream analysis') print('\nAmbiguous muts corrected. Proceeding with downstream analysis')
else: else:
print('\nAmbiguous muts NOT corrected. Quitting!') print('\nAmbiguous muts NOT corrected. Quitting!')
sys.exit() sys.exit()
else:
print('Mutations ARE NOT ambiguous, proceeding to downstream analyses')
#gene_LF1['mutation_info_v1'].value_counts() #gene_LF1['mutation_info_v1'].value_counts()
gene_LF1['mutation_info'].value_counts() gene_LF1['mutation_info'].value_counts()
@ -1649,7 +1676,7 @@ lf_lin_split['lineage_corrupt'].value_counts()
search = ";" search = ";"
# count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurrence # count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurrence
count_df_lin = gene_LF3_ColsSel[['lineage']] count_df_lin = gene_LF3_ColsSel[['lineage']].copy()
count_df_lin['lineage_semicolon_count'] = gene_LF3_ColsSel.loc[:, 'lineage'].str.count(search, re.I) count_df_lin['lineage_semicolon_count'] = gene_LF3_ColsSel.loc[:, 'lineage'].str.count(search, re.I)
lin_sc_C = count_df_lin['lineage_semicolon_count'].value_counts().reset_index() lin_sc_C = count_df_lin['lineage_semicolon_count'].value_counts().reset_index()
lin_sc_C lin_sc_C
@ -1708,7 +1735,7 @@ lf_lin_split['lineage_multimode'].value_counts()
#%% Select only the columns you want to merge from lf_lin_split #%% Select only the columns you want to merge from lf_lin_split
lf_lin_split.columns lf_lin_split.columns
lf_lin_split_ColSel = lf_lin_split[['lineage_corrupt_list','lineage_corrupt_count' lf_lin_split_ColSel = lf_lin_split[['lineage_corrupt_list','lineage_corrupt_count'
, 'lineage_corrupt_ucount' ,'lineage_ulist', 'lineage_multimode']] , 'lineage_corrupt_ucount' ,'lineage_ulist', 'lineage_multimode']].copy()
lf_lin_split_ColSel.columns lf_lin_split_ColSel.columns
lf_lin_split_ColSel.rename(columns = {'lineage_corrupt_list' : 'lineage_list_all' lf_lin_split_ColSel.rename(columns = {'lineage_corrupt_list' : 'lineage_list_all'
@ -1951,28 +1978,29 @@ print('\n============================================'
, '\n' , '\n'
, '\nTotal no. of unique nsSNPs [check1: length of snps_only]:', len(snps_only) , '\nTotal no. of unique nsSNPs [check1: length of snps_only]:', len(snps_only)
, '\nTotal no.of unique dr muts:' , dr_muts_rev.nunique()
, '\nTotal no.of unique other muts:' , other_muts_rev.nunique()
, '\nTotal no. of unique nsSNPs [check2: dr_muts + other_muts]:', dr_muts_rev.nunique()+other_muts_rev.nunique()
, '\nTotal no.of unique nSNSPs [check3, gene_LF4]:', gene_LF4['mutationinformation'].nunique() , '\nTotal no.of unique nSNSPs [check3, gene_LF4]:', gene_LF4['mutationinformation'].nunique()
, '\nTotal no.of unique positions associated with missense muts:', gene_LF4['position'].nunique() , '\nTotal no.of unique positions associated with missense muts:', gene_LF4['position'].nunique()
, '\nTotal no. of samples with nsSNPs:', len(gene_LF4) , '\nTotal no. of samples with nsSNPs:', len(gene_LF4)
, '\nTotal no. of unique sample ids with nsSNPs:', gene_LF4['id'].nunique() , '\nTotal no. of unique sample ids with nsSNPs:', gene_LF4['id'].nunique()
) )
if 'common_muts' in globals():
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: print('\nTotal no.of unique dr muts:' , dr_muts_rev.nunique()
, '\nTotal no.of unique other muts:' , other_muts_rev.nunique()
, '\nTotal no. of unique nsSNPs [check2: dr_muts + other_muts]:', dr_muts_rev.nunique()+other_muts_rev.nunique()
)
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
print('\nTotal no.of samples with ambiguous muts:', len(inspect) print('\nTotal no.of samples with ambiguous muts:', len(inspect)
#, '\nTotal no.of unique ambiguous muts:', len(common_muts) #, '\nTotal no.of unique ambiguous muts:', len(common_muts)
, '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique() , '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique()
, '\n=============================================================' , '\n============================================================='
, '\nPost resolving ambiguity\n' , '\nPost resolving ambiguity\n'
, ambig_muts_rev_df['mutation_info_REV'].value_counts()) , ambig_muts_rev_df['mutation_info_REV'].value_counts())
else:
print('No ambiguous muts present, hence no summary')
print('\n=============================================================' print('\n============================================================='
, '\n=============================================================' , '\n============================================================='
, '\###############################\n' , '\n###############################\n'
, '\nNumbers for ML workflows...' , '\nNumbers for ML workflows...'
, '\n###############################\n' , '\n###############################\n'