diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py
index b8694b5..2f3531a 100755
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -1094,6 +1094,23 @@ print('Created column: mutationinformation'
       , '\n=====================================================================\n'
       , gene_LF1.mutationinformation.head(10))
 
+#order by position for convenience
+# converting position to numeric (pd.to_numeric raises on unparseable values)
+gene_LF1['position'] = pd.to_numeric(gene_LF1['position'])
+
+# sort by position inplace
+gene_LF1.sort_values(by = ['position'], inplace = True)
+
+# Check the ordering itself: sorting never changes how often each position
+# occurs, so comparing position counts cannot confirm that the df is ordered
+if gene_LF1['position'].is_monotonic_increasing:
+    print('PASS: df ordered by position')
+    print(gene_LF1['position'].head())
+else:
+    print('FAIL: df could not be orderd. Check source')
+    # non-zero exit status so that a calling shell/pipeline sees the failure
+    sys.exit(1)
+
 #%% Write file: mCSM muts
 snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique())
 snps_only.head()
@@ -1128,6 +1145,30 @@ print('Finished writing:', outfile_mcsmsnps
       , '\n=============================================================')
 del(out_filename_mcsmsnps)
 
+#%% Write file: frequency of position counts
+# one row per row of gene_LF1: its position and the number of gene_LF1 rows
+# sharing that position
+metadata_pos = gene_LF1[['position']].copy()
+pos_counts = gene_LF1['position'].value_counts()
+metadata_pos['meta_pos_count'] = metadata_pos['position'].map(pos_counts)
+
+# most frequent positions first
+metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)
+
+out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
+outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
+print('Writing file: Metadata poscounts'
+      , '\nFile:', outfile_metadata_poscounts
+      , '\n============================================================')
+
+metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
+print('Finished writing:', outfile_metadata_poscounts
+      , '\nNo. of rows:', len(metadata_pos)
+      , '\nNo. of cols:', len(metadata_pos.columns)
+      , '\n=============================================================')
+del(out_filename_metadata_poscounts)
+
+
 #%% Write file: gene_metadata (i.e gene_LF1)
 # where each row has UNIQUE mutations NOT unique sample ids
 out_filename_metadata = gene.lower() + '_metadata.csv'
@@ -1213,9 +1254,20 @@ pos_only_sorted.to_csv(outfile_pos, header = True, index = False)
 print('Finished writing:', outfile_pos
       , '\nNo. of rows:', len(pos_only_sorted)
       , '\nNo. of cols:', len(pos_only_sorted.columns)
-      , '\n=============================================================')
+      , '\n============================================================='
+      , '\n\n\n')
 del(out_filename_pos)
 
+#%% quick summary output
+print('============================================'
+      , '\nQuick summary output for', gene.lower()
+      , '\n============================================'
+      , '\nTotal no.of unique missense muts:', gene_LF1['mutationinformation'].nunique()
+      , '\nTotal no.of unique positions associated with missense muts:',gene_LF1['position'].nunique()
+      , '\nTotal no. of samples with missense muts:', len(gene_LF1)
+      , '\n============================================================='
+      , '\n\n\n')
+
 #=======================================================================
 print(u'\u2698' * 50,
       '\nEnd of script: Data extraction and writing files'