sorted df by position for output in data_extraction

This commit is contained in:
Tanushree Tunstall 2020-08-14 17:57:12 +01:00
parent 48773a19ef
commit 841d18d10b

View file

@ -1094,6 +1094,24 @@ print('Created column: mutationinformation'
, '\n=====================================================================\n' , '\n=====================================================================\n'
, gene_LF1.mutationinformation.head(10)) , gene_LF1.mutationinformation.head(10))
#order by position for convenience
# Interactive leftover: shows the column dtypes when run in a console.
gene_LF1.dtypes

# converting position to numeric so the sort below compares numbers
# (a string-typed column would be ordered lexicographically instead)
gene_LF1['position'] = pd.to_numeric(gene_LF1['position'])

# sort by position inplace
gene_LF1.sort_values(by = ['position'], inplace = True)

# Sanity check: confirm the rows really are in ascending position order.
# Comparing value_counts() before and after the sort cannot detect an
# unsorted frame, because the counts do not depend on row order; a
# monotonicity test checks the ordering itself.
# Missing positions are ignored here: sort_values places NaN last, and a
# NaN would otherwise make the monotonicity test report False.
if gene_LF1['position'].dropna().is_monotonic_increasing:
    print('PASS: df ordered by position')
    print(gene_LF1['position'].head())
else:
    print('FAIL: df could not be orderd. Check source')
    # Non-zero status so callers/pipelines can see the script failed;
    # a bare sys.exit() would report success (status 0).
    sys.exit(1)
#%% Write file: mCSM muts #%% Write file: mCSM muts
snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique()) snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique())
snps_only.head() snps_only.head()
@ -1128,6 +1146,31 @@ print('Finished writing:', outfile_mcsmsnps
, '\n=============================================================') , '\n=============================================================')
del(out_filename_mcsmsnps) del(out_filename_mcsmsnps)
#%%# write frequency of position counts
# Build a two-column table: every row of gene_LF1 contributes its position,
# paired with the number of gene_LF1 rows that share that position.
position_frequency = gene_LF1['position'].value_counts()
metadata_pos = gene_LF1['position'].to_frame()
metadata_pos['meta_pos_count'] = metadata_pos['position'].map(position_frequency)

# Most frequently hit positions first
metadata_pos = metadata_pos.sort_values(by = 'meta_pos_count', ascending = False)

# Write file: metadata position counts
# (one row per gene_LF1 row, columns: position, meta_pos_count)
out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
outfile_metadata_poscounts = '/'.join([outdir, out_filename_metadata_poscounts])

print('Writing file: Metadata poscounts'
      , '\nFile:', outfile_metadata_poscounts
      , '\n============================================================')

metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)

poscounts_nrows, poscounts_ncols = metadata_pos.shape
print('Finished writing:', outfile_metadata_poscounts
      , '\nNo. of rows:', poscounts_nrows
      , '\nNo. of cols:', poscounts_ncols
      , '\n=============================================================')

# the bare filename is no longer needed once the full path has been used
del(out_filename_metadata_poscounts)
#%% Write file: gene_metadata (i.e gene_LF1) #%% Write file: gene_metadata (i.e gene_LF1)
# where each row has UNIQUE mutations NOT unique sample ids # where each row has UNIQUE mutations NOT unique sample ids
out_filename_metadata = gene.lower() + '_metadata.csv' out_filename_metadata = gene.lower() + '_metadata.csv'
@ -1213,9 +1256,20 @@ pos_only_sorted.to_csv(outfile_pos, header = True, index = False)
print('Finished writing:', outfile_pos print('Finished writing:', outfile_pos
, '\nNo. of rows:', len(pos_only_sorted) , '\nNo. of rows:', len(pos_only_sorted)
, '\nNo. of cols:', len(pos_only_sorted.columns) , '\nNo. of cols:', len(pos_only_sorted.columns)
, '\n=============================================================') , '\n============================================================='
, '\n\n\n')
del(out_filename_pos) del(out_filename_pos)
#%% quick summary output
# Headline numbers for this run, all derived from gene_LF1:
#   - distinct values in 'mutationinformation'
#   - distinct values in 'position'
#   - total number of rows
summary_n_muts = gene_LF1['mutationinformation'].nunique()
summary_n_positions = gene_LF1['position'].nunique()
summary_n_rows = len(gene_LF1)

print('============================================'
      , '\nQuick summary output for', gene.lower()
      , '\n============================================'
      , '\nTotal no.of unique missense muts:', summary_n_muts
      , '\nTotal no.of unique positions associated with missense muts:',summary_n_positions
      , '\nTotal no. of samples with missense muts:', summary_n_rows
      , '\n============================================================='
      , '\n\n\n')
#======================================================================= #=======================================================================
print(u'\u2698' * 50, print(u'\u2698' * 50,
'\nEnd of script: Data extraction and writing files' '\nEnd of script: Data extraction and writing files'