sorted df by position for output in data_extraction
This commit is contained in:
parent
48773a19ef
commit
841d18d10b
1 changed file with 55 additions and 1 deletion
|
@ -1094,6 +1094,24 @@ print('Created column: mutationinformation'
|
|||
, '\n=====================================================================\n'
|
||||
, gene_LF1.mutationinformation.head(10))
|
||||
|
||||
# Order gene_LF1 by position for convenience.
# Make sure 'position' is numeric first, so that the sort below is numeric
# rather than lexicographic should the column hold strings.
gene_LF1['position'] = pd.to_numeric(gene_LF1['position'])

# Per-position row counts taken before and after the in-place sort: they must
# be identical, otherwise rows were lost or altered by the sort.
pos_counts_before = gene_LF1['position'].value_counts()
gene_LF1.sort_values(by = ['position'], inplace = True)
pos_counts_after = gene_LF1['position'].value_counts()

# value_counts() orders by count, so positions with tied counts can come out
# in a different order before and after the sort; comparing such Series with
# '==' raises ValueError (not identically labelled). Align both on their index
# before comparing.
counts_unchanged = pos_counts_before.sort_index().equals(pos_counts_after.sort_index())

# Unchanged counts alone say nothing about row order, so also verify that the
# column is now non-decreasing (missing positions, if any, are sorted last by
# sort_values and are ignored for this check).
is_ordered = gene_LF1['position'].dropna().is_monotonic_increasing

if counts_unchanged and is_ordered:
    print('PASS: df ordered by position')
    print(gene_LF1['position'].head())
else:
    print('FAIL: df could not be orderd. Check source')
    # non-zero exit status so that callers/pipelines can detect the failure
    sys.exit(1)
|
||||
|
||||
#%% Write file: mCSM muts
|
||||
snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique())
|
||||
snps_only.head()
|
||||
|
@ -1128,6 +1146,31 @@ print('Finished writing:', outfile_mcsmsnps
|
|||
, '\n=============================================================')
|
||||
del(out_filename_mcsmsnps)
|
||||
|
||||
#%% Write file: frequency of position counts
# One row per row of gene_LF1, holding its position and the number of rows in
# gene_LF1 that share that position (column 'meta_pos_count').
metadata_pos = gene_LF1[['position']].copy()
pos_counts = gene_LF1['position'].value_counts()
metadata_pos['meta_pos_count'] = metadata_pos['position'].map(pos_counts)

# Most frequent positions first; 'position' is used as a tie-breaker so that
# the row order of the output file is reproducible between runs.
metadata_pos.sort_values(by = ['meta_pos_count', 'position']
                         , ascending = [False, True]
                         , inplace = True)

# Write file: metadata position counts
# (columns: position, meta_pos_count)
out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
print('Writing file: Metadata poscounts'
      , '\nFile:', outfile_metadata_poscounts
      , '\n============================================================')

metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
print('Finished writing:', outfile_metadata_poscounts
      , '\nNo. of rows:', len(metadata_pos)
      , '\nNo. of cols:', len(metadata_pos.columns)
      , '\n=============================================================')
# the bare filename is no longer needed once the full path has been built
del(out_filename_metadata_poscounts)
|
||||
|
||||
|
||||
#%% Write file: gene_metadata (i.e gene_LF1)
|
||||
# where each row has UNIQUE mutations NOT unique sample ids
|
||||
out_filename_metadata = gene.lower() + '_metadata.csv'
|
||||
|
@ -1213,9 +1256,20 @@ pos_only_sorted.to_csv(outfile_pos, header = True, index = False)
|
|||
print('Finished writing:', outfile_pos
|
||||
, '\nNo. of rows:', len(pos_only_sorted)
|
||||
, '\nNo. of cols:', len(pos_only_sorted.columns)
|
||||
, '\n=============================================================')
|
||||
, '\n============================================================='
|
||||
, '\n\n\n')
|
||||
|
||||
del(out_filename_pos)
|
||||
#%% quick summary output
# Headline figures for the gene, computed up front and then emitted with a
# single print() call (arguments are joined by print's default separator).
n_unique_muts = gene_LF1['mutationinformation'].nunique()
n_unique_positions = gene_LF1['position'].nunique()
n_rows = len(gene_LF1)

summary_parts = [
    '============================================',
    '\nQuick summary output for', gene.lower(),
    '\n============================================',
    '\nTotal no.of unique missense muts:', n_unique_muts,
    '\nTotal no.of unique positions associated with missense muts:', n_unique_positions,
    '\nTotal no. of samples with missense muts:', n_rows,
    '\n=============================================================',
    '\n\n\n',
]
print(*summary_parts)
|
||||
|
||||
#=======================================================================
|
||||
print(u'\u2698' * 50,
|
||||
'\nEnd of script: Data extraction and writing files'
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue