updated data extraction to ensure genes without common mutations and duplicate indices can run from the cmd
This commit is contained in:
parent
5429b8fed7
commit
e54ae877a8
1 changed files with 170 additions and 142 deletions
|
@ -376,7 +376,7 @@ print('===========================================================\n'
|
||||||
search = ";"
|
search = ";"
|
||||||
|
|
||||||
# count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurence
|
# count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurence
|
||||||
count_df_dr = meta_gene_dr[['id', dr_muts_col]]
|
count_df_dr = meta_gene_dr[['id', dr_muts_col]].copy()
|
||||||
count_df_dr['dr_semicolon_count'] = meta_gene_dr.loc[:, dr_muts_col].str.count(search, re.I)
|
count_df_dr['dr_semicolon_count'] = meta_gene_dr.loc[:, dr_muts_col].str.count(search, re.I)
|
||||||
dr_sc_C = count_df_dr['dr_semicolon_count'].value_counts().reset_index()
|
dr_sc_C = count_df_dr['dr_semicolon_count'].value_counts().reset_index()
|
||||||
dr_sc_C
|
dr_sc_C
|
||||||
|
@ -502,7 +502,7 @@ if other_muts_col in dr_df.columns:
|
||||||
#%% Important sanity checks: other muts column for tidy split(), nsSNPs, etc.
|
#%% Important sanity checks: other muts column for tidy split(), nsSNPs, etc.
|
||||||
# Split based on semi colon on other_muts_col
|
# Split based on semi colon on other_muts_col
|
||||||
# count of occurrence of ";" in other_muts_col: No.of semicolons + 1 is no. of rows created * occurence
|
# count of occurrence of ";" in other_muts_col: No.of semicolons + 1 is no. of rows created * occurence
|
||||||
count_df_other = meta_gene_other[['id', other_muts_col]]
|
count_df_other = meta_gene_other[['id', other_muts_col]].copy()
|
||||||
count_df_other['other_semicolon_count'] = meta_gene_other.loc[:, other_muts_col].str.count(search, re.I)
|
count_df_other['other_semicolon_count'] = meta_gene_other.loc[:, other_muts_col].str.count(search, re.I)
|
||||||
other_sc_C = count_df_other['other_semicolon_count'].value_counts().reset_index()
|
other_sc_C = count_df_other['other_semicolon_count'].value_counts().reset_index()
|
||||||
other_sc_C
|
other_sc_C
|
||||||
|
@ -790,7 +790,8 @@ print('Length of gene_LF0:', len(gene_LF0)
|
||||||
, '\nThis should be what we need. But just double checking and extracting nsSNP for', gene
|
, '\nThis should be what we need. But just double checking and extracting nsSNP for', gene
|
||||||
, '\nfrom LF0 (concatenated data) using case insensitive regex match:', nssnp_match)
|
, '\nfrom LF0 (concatenated data) using case insensitive regex match:', nssnp_match)
|
||||||
|
|
||||||
gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)]
|
#gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)]
|
||||||
|
gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)].copy()
|
||||||
|
|
||||||
if len(gene_LF0) == len(gene_LF1):
|
if len(gene_LF0) == len(gene_LF1):
|
||||||
print('PASS: length of gene_LF0 and gene_LF1 match',
|
print('PASS: length of gene_LF0 and gene_LF1 match',
|
||||||
|
@ -859,8 +860,7 @@ other_muts = muts_split[1][1].mutation
|
||||||
print('splitting muts by mut_info:', muts_split)
|
print('splitting muts by mut_info:', muts_split)
|
||||||
print('no.of dr_muts samples:', len(dr_muts))
|
print('no.of dr_muts samples:', len(dr_muts))
|
||||||
print('no. of other_muts samples', len(other_muts))
|
print('no. of other_muts samples', len(other_muts))
|
||||||
#%% Ambiguous muts
|
#%% Ambiguous muts: the same mutation cannot be classed as a drug AND 'others'
|
||||||
# IMPORTANT: The same mutation cannot be classed as a drug AND 'others'
|
|
||||||
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
|
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
|
||||||
print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category'
|
print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category'
|
||||||
, '\n===============================================================')
|
, '\n===============================================================')
|
||||||
|
@ -894,139 +894,166 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
|
||||||
print('\n===========================================================')
|
print('\n===========================================================')
|
||||||
else:
|
else:
|
||||||
#sys.exit('Error: ambiguous muts present, but extraction failed. Debug!')
|
#sys.exit('Error: ambiguous muts present, but extraction failed. Debug!')
|
||||||
print('No: ambiguous muts present')
|
print('Ambiguous muts are NOT present')
|
||||||
|
|
||||||
#%% Ambiguous muts: revised annotation for mutation_info
|
|
||||||
ambiguous_muts_df = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
|
|
||||||
ambiguous_muts_value_counts = ambiguous_muts_df.groupby('mutation')['mutation_info'].value_counts()
|
|
||||||
ambiguous_muts_value_counts
|
|
||||||
|
|
||||||
|
#%% DOES NOT depend on common_muts
|
||||||
gene_LF1_orig = gene_LF1.copy()
|
gene_LF1_orig = gene_LF1.copy()
|
||||||
gene_LF1_orig.equals(gene_LF1)
|
gene_LF1_orig.equals(gene_LF1)
|
||||||
|
|
||||||
# copy the old columns for checking
|
# copy the old columns for checking
|
||||||
gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info']
|
gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info']
|
||||||
gene_LF1['mutation_info_v1'] = gene_LF1['mutation_info']
|
gene_LF1['mutation_info_v1'] = gene_LF1['mutation_info']
|
||||||
gene_LF1['mutation_info'].value_counts()
|
gene_LF1['mutation_info'].value_counts()
|
||||||
#%% Inspect ambiguous muts
|
|
||||||
#=====================================
|
|
||||||
# Now create a new df that will have:
|
|
||||||
# ambiguous muts
|
|
||||||
# mutation_info
|
|
||||||
# revised mutation_info
|
|
||||||
# The revised label is based on value_counts
|
|
||||||
# for mutaiton_info. The corresponding mutation_info:
|
|
||||||
# label is chosen that corresponds to the max of value counts
|
|
||||||
#=====================================
|
|
||||||
ambig_muts_rev_df = pd.DataFrame()
|
|
||||||
changes_val = []
|
|
||||||
changes_dict = {}
|
|
||||||
|
|
||||||
##BROKENNNN!!!!
|
#%% Ambiguous muts: revised annotation for mutation_info
|
||||||
common_muts
|
if 'common_muts' in globals():
|
||||||
gene_LF1['mutation'].head()
|
ambiguous_muts_df = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
|
||||||
#common_muts_lower = list((map(lambda x: x.lower(), common_muts)))
|
ambiguous_muts_value_counts = ambiguous_muts_df.groupby('mutation')['mutation_info'].value_counts()
|
||||||
#common_muts_lower
|
ambiguous_muts_value_counts
|
||||||
|
|
||||||
for i in common_muts:
|
|
||||||
#for i in common_muts_lower:
|
|
||||||
#print(i)
|
|
||||||
temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']]
|
|
||||||
temp_df
|
|
||||||
# DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending
|
|
||||||
max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts()
|
|
||||||
max_info_table
|
|
||||||
revised_label = max_info_table[[0]].index[0][1] # max value_count
|
|
||||||
old_label = max_info_table[[1]].index[0][1] # min value_count
|
|
||||||
print('\nAmbiguous mutation labels...'
|
|
||||||
, '\nSetting mutation_info for', i, 'to', revised_label)
|
|
||||||
temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info_orig'] == old_label)
|
|
||||||
, revised_label
|
|
||||||
, temp_df['mutation_info_orig'])
|
|
||||||
ambig_muts_rev_df = pd.concat([ambig_muts_rev_df,temp_df])
|
|
||||||
f = max_info_table[[1]]
|
|
||||||
old_label_count = f[[0]][0]
|
|
||||||
changes_val.append(old_label_count)
|
|
||||||
cc_dict = f.to_dict()
|
|
||||||
changes_dict.update(cc_dict)
|
|
||||||
|
|
||||||
ambig_muts_rev_df['mutation_info_REV'].value_counts()
|
|
||||||
ambig_muts_rev_df['mutation_info_orig'].value_counts()
|
|
||||||
changes_val
|
|
||||||
changes_total = sum(changes_val)
|
|
||||||
changes_dict
|
|
||||||
#%% OUTFILE 1, write file: ambiguous muts and ambiguous mut counts
|
|
||||||
#==================
|
|
||||||
# ambiguous muts
|
|
||||||
#==================
|
|
||||||
#dr_muts.XXX_csvXXXX('dr_muts.csv', header = True)
|
|
||||||
#other_muts.XXXX_csvXXX('other_muts.csv', header = True)
|
|
||||||
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
|
|
||||||
out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
|
|
||||||
outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
|
|
||||||
print('\n----------------------------------'
|
|
||||||
, '\nWriting file: ambiguous muts'
|
|
||||||
, '\n----------------------------------'
|
|
||||||
, '\nFilename:', outfile_ambig_muts)
|
|
||||||
inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
|
|
||||||
inspect.to_csv(outfile_ambig_muts, index = True)
|
|
||||||
|
|
||||||
print('Finished writing:', out_filename_ambig_muts
|
|
||||||
, '\nNo. of rows:', len(inspect)
|
|
||||||
, '\nNo. of cols:', len(inspect.columns)
|
|
||||||
, '\nNo. of rows = no. of samples with the ambiguous muts present:'
|
|
||||||
, dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum()
|
|
||||||
, '\n=============================================================')
|
|
||||||
del(out_filename_ambig_muts)
|
|
||||||
|
|
||||||
#%% OUTFILE 2, write file: ambiguous mut counts
|
# gene_LF1_orig = gene_LF1.copy()
|
||||||
#======================
|
# gene_LF1_orig.equals(gene_LF1)
|
||||||
# ambiguous mut counts
|
|
||||||
#======================
|
# # copy the old columns for checking
|
||||||
out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv'
|
# gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info']
|
||||||
outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts
|
# gene_LF1['mutation_info_v1'] = gene_LF1['mutation_info']
|
||||||
print('\n----------------------------------'
|
# gene_LF1['mutation_info'].value_counts()
|
||||||
, '\nWriting file: ambiguous mut counts'
|
#%% Inspect ambiguous muts
|
||||||
, '\n----------------------------------'
|
#=====================================
|
||||||
, '\nFilename:', outfile_ambig_mut_counts)
|
# Now create a new df that will have:
|
||||||
|
# ambiguous muts
|
||||||
|
# mutation_info
|
||||||
|
# revised mutation_info
|
||||||
|
# The revised label is based on value_counts
|
||||||
|
# for mutaiton_info. The corresponding mutation_info:
|
||||||
|
# label is chosen that corresponds to the max of value counts
|
||||||
|
#=====================================
|
||||||
|
ambig_muts_rev_df = pd.DataFrame()
|
||||||
|
changes_val = []
|
||||||
|
changes_dict = {}
|
||||||
|
|
||||||
|
common_muts
|
||||||
|
gene_LF1['mutation'].head()
|
||||||
|
#common_muts_lower = list((map(lambda x: x.lower(), common_muts)))
|
||||||
|
#common_muts_lower
|
||||||
|
|
||||||
|
for i in common_muts:
|
||||||
|
#for i in common_muts_lower:
|
||||||
|
#print(i)
|
||||||
|
temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']]
|
||||||
|
temp_df
|
||||||
|
# DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending
|
||||||
|
max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts()
|
||||||
|
max_info_table
|
||||||
|
revised_label = max_info_table[[0]].index[0][1] # max value_count
|
||||||
|
old_label = max_info_table[[1]].index[0][1] # min value_count
|
||||||
|
print('\nAmbiguous mutation labels...'
|
||||||
|
, '\nSetting mutation_info for', i, 'to', revised_label)
|
||||||
|
temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info_orig'] == old_label)
|
||||||
|
, revised_label
|
||||||
|
, temp_df['mutation_info_orig'])
|
||||||
|
ambig_muts_rev_df = pd.concat([ambig_muts_rev_df,temp_df])
|
||||||
|
f = max_info_table[[1]]
|
||||||
|
old_label_count = f[[0]][0]
|
||||||
|
changes_val.append(old_label_count)
|
||||||
|
cc_dict = f.to_dict()
|
||||||
|
changes_dict.update(cc_dict)
|
||||||
|
|
||||||
|
ambig_muts_rev_df['mutation_info_REV'].value_counts()
|
||||||
|
ambig_muts_rev_df['mutation_info_orig'].value_counts()
|
||||||
|
changes_val
|
||||||
|
changes_total = sum(changes_val)
|
||||||
|
changes_dict
|
||||||
|
n_changes = sum(changes_dict.values())
|
||||||
|
#%% OUTFILE 1, write file: ambiguous muts and ambiguous mut counts
|
||||||
|
#==================
|
||||||
|
# ambiguous muts
|
||||||
|
#==================
|
||||||
|
#dr_muts.XXX_csvXXXX('dr_muts.csv', header = True)
|
||||||
|
#other_muts.XXXX_csvXXX('other_muts.csv', header = True)
|
||||||
|
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
|
||||||
|
out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
|
||||||
|
outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
|
||||||
|
print('\n----------------------------------'
|
||||||
|
, '\nWriting file: ambiguous muts'
|
||||||
|
, '\n----------------------------------'
|
||||||
|
, '\nFilename:', outfile_ambig_muts)
|
||||||
|
inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
|
||||||
|
inspect.to_csv(outfile_ambig_muts, index = True)
|
||||||
|
|
||||||
|
print('Finished writing:', out_filename_ambig_muts
|
||||||
|
, '\nNo. of rows:', len(inspect)
|
||||||
|
, '\nNo. of cols:', len(inspect.columns)
|
||||||
|
, '\nNo. of rows = no. of samples with the ambiguous muts present:'
|
||||||
|
, dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum()
|
||||||
|
, '\n=============================================================')
|
||||||
|
del(out_filename_ambig_muts)
|
||||||
|
|
||||||
|
#%% OUTFILE 2, write file: ambiguous mut counts
|
||||||
|
#======================
|
||||||
|
# ambiguous mut counts
|
||||||
|
#======================
|
||||||
|
out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv'
|
||||||
|
outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts
|
||||||
|
print('\n----------------------------------'
|
||||||
|
, '\nWriting file: ambiguous mut counts'
|
||||||
|
, '\n----------------------------------'
|
||||||
|
, '\nFilename:', outfile_ambig_mut_counts)
|
||||||
|
|
||||||
|
ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)
|
||||||
|
#%% FIXME: Add sanity check to make sure you can add value_count checks
|
||||||
|
#%% Resolving ambiguous muts: Merging ambiguous muts with the revised annotations
|
||||||
|
#=================
|
||||||
|
# Merge ambig muts
|
||||||
|
# with gene_LF1
|
||||||
|
#===================
|
||||||
|
ambig_muts_rev_df.index
|
||||||
|
gene_LF1.index
|
||||||
|
all(ambig_muts_rev_df.index.isin(gene_LF1.index))
|
||||||
|
|
||||||
|
any(gene_LF1.index.isin(ambig_muts_rev_df.index))
|
||||||
|
# if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == len(ambig_muts_rev_df)):
|
||||||
|
if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == ambig_muts_rev_df.index.nunique()):
|
||||||
|
|
||||||
ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)
|
print('\nPASS: ambiguous mut indices present in gene_LF1. Prepare to merge...')
|
||||||
#%% FIXME: Add sanity check to make sure you can add value_count checks
|
else:
|
||||||
#%% Resolving ambiguous muts: Merging ambiguous muts with the revised annotations
|
sys.exit('\nFAIL:ambiguous mut indices MISmatch. Check section Resolving ambiguous muts')
|
||||||
#=================
|
|
||||||
# Merge ambig muts
|
##########################################################################
|
||||||
# with gene_LF1
|
for index, row in ambig_muts_rev_df.iterrows():
|
||||||
#===================
|
curr_mut = row['mutation']
|
||||||
ambig_muts_rev_df.index
|
curr_rev = row['mutation_info_REV']
|
||||||
gene_LF1.index
|
print('\n=====\nAmbiguous Mutation: index:', index, '\nmutation:', curr_mut, '\nNew:', curr_rev, '\n=====\n' )
|
||||||
all(ambig_muts_rev_df.index.isin(gene_LF1.index))
|
print('\n-----\nReplacing original: index:', index, '\nmutation: '
|
||||||
|
, gene_LF1.loc[index,'mutation']
|
||||||
|
, '\nmutation_info to replace:'
|
||||||
|
, gene_LF1.loc[index,'mutation_info']
|
||||||
|
, '\nwith:', curr_rev, '\n-----')
|
||||||
|
replacement_row=(gene_LF1.index==index) & (gene_LF1['mutation'] == curr_mut)
|
||||||
|
gene_LF1.loc[replacement_row, 'mutation_info'] = curr_rev
|
||||||
|
|
||||||
|
###########################################################################
|
||||||
|
|
||||||
|
gene_LF1['mutation_info_orig'].value_counts()
|
||||||
|
gene_LF1['mutation_info_v1'].value_counts()
|
||||||
|
gene_LF1['mutation_info'].value_counts()
|
||||||
|
|
||||||
any(gene_LF1.index.isin(ambig_muts_rev_df.index))
|
# Sanity check1: if there are still any ambiguous muts
|
||||||
if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == len(ambig_muts_rev_df)):
|
#muts_split_rev = list(gene_LF1.groupby('mutation_info_v1'))
|
||||||
print('\nPASS: ambiguous mut indices present in gene_LF1. Prepare to merge...')
|
muts_split_rev = list(gene_LF1.groupby('mutation_info'))
|
||||||
|
dr_muts_rev = muts_split_rev[0][1].mutation
|
||||||
|
other_muts_rev = muts_split_rev[1][1].mutation
|
||||||
|
print('splitting muts by mut_info:', muts_split_rev)
|
||||||
|
print('no.of dr_muts samples:', len(dr_muts_rev))
|
||||||
|
print('no. of other_muts samples', len(other_muts_rev))
|
||||||
|
|
||||||
|
if not dr_muts_rev.isin(other_muts_rev).sum() & other_muts_rev.isin(dr_muts_rev).sum() > 0:
|
||||||
|
print('\nAmbiguous muts corrected. Proceeding with downstream analysis')
|
||||||
|
else:
|
||||||
|
print('\nAmbiguous muts NOT corrected. Quitting!')
|
||||||
|
sys.exit()
|
||||||
else:
|
else:
|
||||||
sys.exit('\nFAIL:ambiguous mut indices MISmatch. Check section Resolving ambiguous muts')
|
print('Mutations ARE NOT ambiguous, proceeding to downstream analyses')
|
||||||
|
|
||||||
#gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV']
|
|
||||||
gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
|
|
||||||
|
|
||||||
gene_LF1['mutation_info_orig'].value_counts()
|
|
||||||
gene_LF1['mutation_info_v1'].value_counts()
|
|
||||||
|
|
||||||
# Sanity check1: if there are still any ambiguous muts
|
|
||||||
#muts_split_rev = list(gene_LF1.groupby('mutation_info_v1'))
|
|
||||||
muts_split_rev = list(gene_LF1.groupby('mutation_info'))
|
|
||||||
dr_muts_rev = muts_split_rev[0][1].mutation
|
|
||||||
other_muts_rev = muts_split_rev[1][1].mutation
|
|
||||||
print('splitting muts by mut_info:', muts_split_rev)
|
|
||||||
print('no.of dr_muts samples:', len(dr_muts_rev))
|
|
||||||
print('no. of other_muts samples', len(other_muts_rev))
|
|
||||||
|
|
||||||
if not dr_muts_rev.isin(other_muts_rev).sum() & other_muts_rev.isin(dr_muts_rev).sum() > 0:
|
|
||||||
print('\nAmbiguous muts corrected. Proceeding with downstream analysis')
|
|
||||||
else:
|
|
||||||
print('\nAmbiguous muts NOT corrected. Quitting!')
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
#gene_LF1['mutation_info_v1'].value_counts()
|
#gene_LF1['mutation_info_v1'].value_counts()
|
||||||
gene_LF1['mutation_info'].value_counts()
|
gene_LF1['mutation_info'].value_counts()
|
||||||
|
@ -1649,7 +1676,7 @@ lf_lin_split['lineage_corrupt'].value_counts()
|
||||||
search = ";"
|
search = ";"
|
||||||
|
|
||||||
# count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurence
|
# count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurence
|
||||||
count_df_lin = gene_LF3_ColsSel[['lineage']]
|
count_df_lin = gene_LF3_ColsSel[['lineage']].copy()
|
||||||
count_df_lin['lineage_semicolon_count'] = gene_LF3_ColsSel.loc[:, 'lineage'].str.count(search, re.I)
|
count_df_lin['lineage_semicolon_count'] = gene_LF3_ColsSel.loc[:, 'lineage'].str.count(search, re.I)
|
||||||
lin_sc_C = count_df_lin['lineage_semicolon_count'].value_counts().reset_index()
|
lin_sc_C = count_df_lin['lineage_semicolon_count'].value_counts().reset_index()
|
||||||
lin_sc_C
|
lin_sc_C
|
||||||
|
@ -1708,7 +1735,7 @@ lf_lin_split['lineage_multimode'].value_counts()
|
||||||
#%% Select only the columns you want to merge from lf_lin_split
|
#%% Select only the columns you want to merge from lf_lin_split
|
||||||
lf_lin_split.columns
|
lf_lin_split.columns
|
||||||
lf_lin_split_ColSel = lf_lin_split[['lineage_corrupt_list','lineage_corrupt_count'
|
lf_lin_split_ColSel = lf_lin_split[['lineage_corrupt_list','lineage_corrupt_count'
|
||||||
, 'lineage_corrupt_ucount' ,'lineage_ulist', 'lineage_multimode']]
|
, 'lineage_corrupt_ucount' ,'lineage_ulist', 'lineage_multimode']].copy()
|
||||||
lf_lin_split_ColSel.columns
|
lf_lin_split_ColSel.columns
|
||||||
|
|
||||||
lf_lin_split_ColSel.rename(columns = {'lineage_corrupt_list' : 'lineage_list_all'
|
lf_lin_split_ColSel.rename(columns = {'lineage_corrupt_list' : 'lineage_list_all'
|
||||||
|
@ -1951,28 +1978,29 @@ print('\n============================================'
|
||||||
, '\n'
|
, '\n'
|
||||||
|
|
||||||
, '\nTotal no. of unique nsSNPs [check1: length of snps_only]:', len(snps_only)
|
, '\nTotal no. of unique nsSNPs [check1: length of snps_only]:', len(snps_only)
|
||||||
|
|
||||||
, '\nTotal no.of unique dr muts:' , dr_muts_rev.nunique()
|
|
||||||
, '\nTotal no.of unique other muts:' , other_muts_rev.nunique()
|
|
||||||
, '\nTotal no. of unique nsSNPs [check2: dr_muts + other_muts]:', dr_muts_rev.nunique()+other_muts_rev.nunique()
|
|
||||||
|
|
||||||
, '\nTotal no.of unique nSNSPs [check3, gene_LF4]:', gene_LF4['mutationinformation'].nunique()
|
, '\nTotal no.of unique nSNSPs [check3, gene_LF4]:', gene_LF4['mutationinformation'].nunique()
|
||||||
, '\nTotal no.of unique positions associated with missense muts:', gene_LF4['position'].nunique()
|
, '\nTotal no.of unique positions associated with missense muts:', gene_LF4['position'].nunique()
|
||||||
, '\nTotal no. of samples with nsSNPs:', len(gene_LF4)
|
, '\nTotal no. of samples with nsSNPs:', len(gene_LF4)
|
||||||
, '\nTotal no. of unique sample ids with nsSNPs:', gene_LF4['id'].nunique()
|
, '\nTotal no. of unique sample ids with nsSNPs:', gene_LF4['id'].nunique()
|
||||||
)
|
)
|
||||||
|
if 'common_muts' in globals():
|
||||||
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
|
print('\nTotal no.of unique dr muts:' , dr_muts_rev.nunique()
|
||||||
print('\nTotal no.of samples with ambiguous muts:', len(inspect)
|
, '\nTotal no.of unique other muts:' , other_muts_rev.nunique()
|
||||||
#, '\nTotal no.of unique ambiguous muts:', len(common_muts)
|
, '\nTotal no. of unique nsSNPs [check2: dr_muts + other_muts]:', dr_muts_rev.nunique()+other_muts_rev.nunique()
|
||||||
, '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique()
|
)
|
||||||
, '\n============================================================='
|
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
|
||||||
, '\nPost resolving ambiguity\n'
|
print('\nTotal no.of samples with ambiguous muts:', len(inspect)
|
||||||
, ambig_muts_rev_df['mutation_info_REV'].value_counts())
|
#, '\nTotal no.of unique ambiguous muts:', len(common_muts)
|
||||||
|
, '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique()
|
||||||
|
, '\n============================================================='
|
||||||
|
, '\nPost resolving ambiguity\n'
|
||||||
|
, ambig_muts_rev_df['mutation_info_REV'].value_counts())
|
||||||
|
else:
|
||||||
|
print('No ambiguous muts present, hence no summary')
|
||||||
|
|
||||||
print('\n============================================================='
|
print('\n============================================================='
|
||||||
, '\n============================================================='
|
, '\n============================================================='
|
||||||
, '\###############################\n'
|
, '\n###############################\n'
|
||||||
, '\nNumbers for ML workflows...'
|
, '\nNumbers for ML workflows...'
|
||||||
, '\n###############################\n'
|
, '\n###############################\n'
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue