From dd916926735370691d8942069f979ad59f5d79cb Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 23 Mar 2020 18:13:02 +0000
Subject: [PATCH] tidy progress messages and report column counts in data_extraction.py

---
 meta_data_analysis/data_extraction.py | 31 +++++++++++++++------------
 1 file changed, 17 insertions(+), 14 deletions(-)
diff --git a/meta_data_analysis/data_extraction.py b/meta_data_analysis/data_extraction.py
index a47cec7..48c31e7 100755
--- a/meta_data_analysis/data_extraction.py
+++ b/meta_data_analysis/data_extraction.py
@@ -328,7 +328,7 @@ out_filename0 = gene.lower() + '_' + 'common_ids.csv'
 outfile0 = homedir + '/' + outdir + '/' + out_filename0
 
 #FIXME: CHECK line len(common_ids)
-print('Writing file: common ids:\n',
+print('Writing file: common ids:',
       '\nFilename:', out_filename0,
       '\nPath:', homedir +'/'+ outdir,
       '\nExpected no. of rows:', len(common_ids) )
@@ -459,7 +459,7 @@ dr_muts_df['dr_sample_freq'] = dr_muts_df.groupby('id')['id'].transform('count')
 print("revised dim of dr_muts_df:", dr_muts_df.shape) 
 
 c1 = dr_muts_df.dr_sample_freq.value_counts()
-print("counting no. of sample frequency\n:", c1)
+print('counting no. of sample frequency:\n', c1)
 print('======================================================================')
 
 # sanity check: length of pnca samples
@@ -521,7 +521,7 @@ other_muts_df['other_sample_freq'] = other_muts_df.groupby('id')['id'].transform
 print("revised dim of other_muts_df:", other_muts_df.shape) 
 
 c2 = other_muts_df.other_sample_freq.value_counts()
-print("counting no. of sample frequency\n:", c2)
+print('counting no. of sample frequency:\n', c2)
 print('======================================================================')
 # sanity check: length of pnca samples
 if len(other_pnca_WF1) == c2.sum():
@@ -696,7 +696,7 @@ else:
    print('Error: ambiguous muts detected, but extraction failed. Debug!',
          '\nNo. of ambiguous muts in dr:', len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist() ),
          '\nNo. of ambiguous muts in other:', len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist()))       
-
+print('======================================================================')
 #%% clear variables
 del(id_dr, id_other, meta_data, meta_pnca_dr, meta_pnca_other, mut_grouped, muts_split, other_WF1, other_df, other_muts_df, other_pnca_count, pnca_LF0, pnca_na)  
 
@@ -712,7 +712,7 @@ del(c1, c2, col_to_split1, col_to_split2, comp_pnca_samples, dr_WF0, dr_df, dr_m
 
 out_filename1 = gene.lower() + '_' + 'ambiguous_muts.csv'
 outfile1 = homedir + '/' + outdir + '/' + out_filename1
-print('Writing file: ambiguous muts...',
+print('Writing file: ambiguous muts',
       '\nFilename:', out_filename1,
       '\nPath:',  homedir +'/'+ outdir)
 
@@ -869,16 +869,17 @@ out_filename2 = gene.lower() + '_' + 'mcsm_snps.csv'
 outfile2 = homedir + '/' + outdir + '/' + out_filename2
 
 print('Writing file: mCSM style muts',
+      '\nFilename:', out_filename2,
+      '\nPath:',  homedir +'/'+ outdir,
       '\nmutation format (SNP): {Wt}<POS>{Mut}',
       '\nNo. of distinct muts:', len(snps_only),
-      '\nNo. of distinct positions:', len(pos_only),
-      '\nFilename:', out_filename2,
-      '\nPath:',  homedir +'/'+ outdir)
+      '\nNo. of distinct positions:', len(pos_only))
 
 snps_only.to_csv(outfile2, header = False, index = False)
 
 print('Finished writing:', out_filename2,
-      '\nNo. of rows:', len(snps_only) )
+      '\nNo. of rows:', len(snps_only),
+      '\nNo. of cols:', len(snps_only.columns))
 print('======================================================================')
 del(out_filename2)
 
@@ -931,15 +932,17 @@ out_filename4 = gene.lower() + '_' + 'all_muts_msa.csv'
 outfile4 = homedir + '/' + outdir + '/' + out_filename4
 
 print('Writing file: mCSM style muts for msa',
+      '\nFilename:', out_filename4,
+      '\nPath:',  homedir +'/'+ outdir,
       '\nmutation format (SNP): {Wt}<POS>{Mut}',
       '\nNo.of lines of msa:', len(all_muts_msa),  
-      '\nFilename:', out_filename4,
-      '\nPath:',  homedir +'/'+ outdir)
+)
 
 all_muts_msa_sorted.to_csv(outfile4, header = False, index = False)
 
 print('Finished writing:', out_filename4,
-      '\nNo. of rows:', len(all_muts_msa) )
+      '\nNo. of rows:', len(all_muts_msa),
+      '\nNo. of cols:', len(all_muts_msa.columns) )
 print('======================================================================')
 del(out_filename4)
 
@@ -968,13 +971,13 @@ print('Writing file: mutational positions',
 pos_only_sorted.to_csv(outfile5, header = True, index = False)
 
 print('Finished writing:', out_filename5,
-      '\nNo. of rows:', len(pos_only_sorted) )
+      '\nNo. of rows:', len(pos_only_sorted),
+      '\nNo. of cols:', len(pos_only_sorted.columns) )
 print('======================================================================')
 del(out_filename5)
 
 
 #%% end of script
-print('======================================================================')
 print(u'\u2698' * 50,
       '\nEnd of script: Data extraction and writing files'
       '\n' + u'\u2698' * 50 )