add wrapper and mcsm library

2020-04-16 17:45:24 +01:00 · 2020-04-16 17:45:24 +01:00 · e50466da39
commit e50466da39
parent 7aafa72e10
6 changed files with 558 additions and 678 deletions
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -54,11 +54,14 @@ arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', defau
 args = arg_parser.parse_args()
 #=======================================================================
 #%% variable assignment: input and output paths & filenames
-#drug = 'pyrazinamide'
-#gene = 'pncA'
+drug = 'cycloserine'
+gene = 'alr'
+
 drug = args.drug
 gene = args.gene
+
 gene_match = gene + '_p.'
+
 # building cols to extract
 dr_muts_col = 'dr_mutations_' + drug
 other_muts_col = 'other_mutations_' + drug
@@ -82,8 +85,7 @@ datadir = homedir + '/' + 'git/Data'
 #=======
 in_filename  = 'original_tanushree_data_v2.csv'
 infile = datadir + '/' + in_filename
-print('Input filename: ', in_filename
-      , '\nInput path: ', datadir
+print('Input file: ', infile
      , '\n============================================================')

 #=======
@@ -352,9 +354,8 @@ out_filename0 = gene.lower() + '_common_ids.csv'
 outfile0 =  outdir + '/' + out_filename0

 #FIXME: CHECK line len(common_ids)
-print('Writing file: common ids:'
-      , '\nFilename:', out_filename0
-      , '\nPath:', outdir
+print('Writing file:'
+      , '\nFile:', outfile0
      , '\nExpected no. of rows:', len(common_ids)
      , '\n=============================================================')

@@ -530,7 +531,7 @@ print('lengths after tidy split and extracting', gene_match, 'muts:',
      '\nexpected len:', other_gene_count)

 if len(other_gene_WF1) == other_gene_count:
-    print('PASS: length of dr_gene_WF0 match with expected length'
+    print('PASS: length matches with expected length'
    , '\n===============================================================')
 else:
    print('FAIL: lengths mismatch'
@@ -685,12 +686,12 @@ else:
          , '\nmuts should be distinct within dr* and other* type'
          , '\ninspecting ...'
          , '\n=========================================================')
-    muts_split = list(gene_LF1.groupby('mutation_info'))
-    dr_muts = muts_split[0][1].mutation 
-    other_muts =  muts_split[1][1].mutation
-#   print('splitting muts by mut_info:', muts_split)
-    print('no.of dr_muts samples:', len(dr_muts))
-    print('no. of other_muts samples', len(other_muts))        
+muts_split = list(gene_LF1.groupby('mutation_info'))
+dr_muts = muts_split[0][1].mutation 
+other_muts =  muts_split[1][1].mutation
+print('splitting muts by mut_info:', muts_split)
+print('no.of dr_muts samples:', len(dr_muts))
+print('no. of other_muts samples', len(other_muts))        
 #%%
 # !!! IMPORTANT !!!!
 # sanity check: There should not be any common muts