sorted df by position for output in data_extraction

2020-08-14 17:57:12 +01:00 · 2020-08-14 17:57:12 +01:00 · 841d18d10b
commit 841d18d10b
parent 48773a19ef
1 changed files with 55 additions and 1 deletions
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -1094,6 +1094,24 @@ print('Created column: mutationinformation'
 	, '\n=====================================================================\n'
    , gene_LF1.mutationinformation.head(10))
 #order by position for convenience
 gene_LF1.dtypes
 # converting position to numeric (if the column was read in as text, the
 # sort below would otherwise compare strings rather than numbers)
 gene_LF1['position'] = pd.to_numeric(gene_LF1['position'])
 # sort by position inplace
 gene_LF1.sort_values(by = ['position'], inplace = True)
 # Verify the ordering itself. Comparing value_counts() taken before and
 # after the sort cannot detect a failed sort: the counts do not depend on
 # row order, and comparing two differently labelled Series can raise.
 # Missing positions are sorted last by sort_values, so they are dropped
 # before testing that the remaining values never decrease.
 if gene_LF1['position'].dropna().is_monotonic_increasing:
    print('PASS: df ordered by position')
    print(gene_LF1['position'].head())
 else:
    print('FAIL: df could not be orderd. Check source')
    # non-zero status so that a calling shell/pipeline sees the failure
    sys.exit(1)
 #%% Write file: mCSM muts
 snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique())
 snps_only.head()
@@ -1128,6 +1146,31 @@ print('Finished writing:', outfile_mcsmsnps
      , '\n=============================================================')
 del(out_filename_mcsmsnps)
 #%% Write file: frequency of position counts
 # Tally how many rows of gene_LF1 fall on each position
 z = gene_LF1['position'].value_counts()
 z1 = z.to_dict()
 # One output row per gene_LF1 row: the position plus the tally for it
 metadata_pos = pd.DataFrame(gene_LF1['position'])
 metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1)
 # distribution of the tallies (only displayed when run interactively)
 metadata_pos['meta_pos_count'].value_counts()
 # highest tallies first
 metadata_pos.sort_values(by = ['meta_pos_count']
                          , ascending = False
                          , inplace = True)
 # Output goes to <outdir>/<gene>_metadata_poscounts.csv
 out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
 outfile_metadata_poscounts = '/'.join([outdir, out_filename_metadata_poscounts])
 print('Writing file: Metadata poscounts'
       , '\nFile:', outfile_metadata_poscounts
       , '\n============================================================')
 metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
 poscounts_nrows = len(metadata_pos)
 poscounts_ncols = len(metadata_pos.columns)
 print('Finished writing:', outfile_metadata_poscounts
       , '\nNo. of rows:', poscounts_nrows
       , '\nNo. of cols:', poscounts_ncols
       , '\n=============================================================')
 # the bare filename is no longer needed once the full path is built
 del out_filename_metadata_poscounts
 #%% Write file: gene_metadata (i.e gene_LF1)
 # where each row has UNIQUE mutations NOT unique sample ids
 out_filename_metadata = gene.lower() + '_metadata.csv'
@@ -1213,9 +1256,20 @@ pos_only_sorted.to_csv(outfile_pos, header = True, index = False)
 print('Finished writing:', outfile_pos
      , '\nNo. of rows:', len(pos_only_sorted)
      , '\nNo. of cols:', len(pos_only_sorted.columns)
-      , '\n=============================================================')
+      , '\n============================================================='
      , '\n\n\n')
 del(out_filename_pos)
 #%% quick summary output
 # Gather the three headline figures first, then report them in one print
 summary_n_unique_muts = gene_LF1['mutationinformation'].nunique()
 summary_n_unique_pos = gene_LF1['position'].nunique()
 summary_n_rows = len(gene_LF1)
 print('============================================'
       , '\nQuick summary output for', gene.lower()
       , '\n============================================'
       , '\nTotal no.of unique missense muts:', summary_n_unique_muts
       , '\nTotal no.of unique positions associated with missense muts:', summary_n_unique_pos
       , '\nTotal no. of samples with missense muts:', summary_n_rows
       , '\n============================================================='
       , '\n\n\n')
 #=======================================================================
 print(u'\u2698' * 50,
      '\nEnd of script: Data extraction and writing files'