sorted df by position for output in data_extraction

2020-08-14 17:57:12 +01:00 · 2020-08-14 17:57:12 +01:00 · 841d18d10b
commit 841d18d10b
parent 48773a19ef
1 changed files with 55 additions and 1 deletions
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -1094,6 +1094,24 @@ print('Created column: mutationinformation'
 	, '\n=====================================================================\n'
    , gene_LF1.mutationinformation.head(10))

+# Order rows by position for convenience
+gene_LF1.dtypes # no effect when run as a script; shows dtypes in an interactive cell
+
+# converting position to numeric (pd.to_numeric raises on unparseable values)
+gene_LF1['position'] = pd.to_numeric(gene_LF1['position'])
+
+# sort by position inplace, then confirm the column really is ascending:
+# value_counts() is identical before and after a sort, so it cannot tell
+gene_LF1.sort_values(by = ['position'], inplace = True)
+is_ordered = gene_LF1['position'].dropna().is_monotonic_increasing # NaNs sort last
+
+if is_ordered:
+    print('PASS: df ordered by position')
+    print(gene_LF1['position'].head())
+else:
+    print('FAIL: df could not be orderd. Check source')
+    sys.exit(1) # non-zero status so callers can detect the failure
+
 #%% Write file: mCSM muts
 snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique())
 snps_only.head()
@@ -1128,6 +1146,31 @@ print('Finished writing:', outfile_mcsmsnps
      , '\n=============================================================')
 del(out_filename_mcsmsnps)

+#%% Write file: frequency of position counts
+metadata_pos = pd.DataFrame(gene_LF1['position'])
+z =  gene_LF1['position'].value_counts()
+z1 = z.to_dict() # {position: no. of rows in gene_LF1 with that position}
+metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1)
+metadata_pos['meta_pos_count'].value_counts()
+
+metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)
+
+# Write file: metadata_pos (position and its meta_pos_count)
+# where there is one row per row of gene_LF1, most frequent positions first
+out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
+outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
+print('Writing file: Metadata poscounts'
+      , '\nFile:', outfile_metadata_poscounts
+      , '\n============================================================')
+
+metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
+print('Finished writing:', outfile_metadata_poscounts
+      , '\nNo. of rows:', len(metadata_pos)
+      , '\nNo. of cols:', len(metadata_pos.columns)
+      , '\n=============================================================')
+del(out_filename_metadata_poscounts)
+
+
 #%% Write file: gene_metadata (i.e gene_LF1)
 # where each row has UNIQUE mutations NOT unique sample ids
 out_filename_metadata = gene.lower() + '_metadata.csv'
@@ -1213,9 +1256,20 @@ pos_only_sorted.to_csv(outfile_pos, header = True, index = False)
 print('Finished writing:', outfile_pos
      , '\nNo. of rows:', len(pos_only_sorted)
      , '\nNo. of cols:', len(pos_only_sorted.columns)
-      , '\n=============================================================')
+      , '\n============================================================='
+      , '\n\n\n')

 del(out_filename_pos)
+#%% Quick summary output: unique mutations, unique positions and row count of gene_LF1
+print('============================================'
+      , '\nQuick summary output for', gene.lower()
+      , '\n============================================'
+      , '\nTotal no.of unique missense muts:', gene_LF1['mutationinformation'].nunique()
+      , '\nTotal no.of unique positions associated with missense muts:',gene_LF1['position'].nunique()
+      , '\nTotal no. of samples with missense muts:', len(gene_LF1) # NOTE(review): this is the row count of gene_LF1 -- confirm each row is one sample
+      , '\n============================================================='
+      , '\n\n\n')
+
 #=======================================================================
 print(u'\u2698' * 50,
      '\nEnd of script: Data extraction and writing files'