added output file for checking

2020-08-11 18:34:02 +01:00 · 2020-08-11 18:34:02 +01:00 · 833e599550
commit 833e599550
parent dbf8865203
2 changed files with 38 additions and 352 deletions
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -52,7 +52,7 @@ Created on Tue Aug  6 12:56:03 2019
 import os, sys
 import re
 import pandas as pd
-#import numpy as np
+import numpy as np
 import argparse
 #=======================================================================
 #%% homdir and curr dir and local imports
@@ -68,18 +68,17 @@ from tidy_split import tidy_split
 #=======================================================================
 #%% command line args
 arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
+arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None)
 arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)

-
 args = arg_parser.parse_args()
 #=======================================================================
 #%% variable assignment: input and output paths & filenames
-#drug = args.drug
-#gene = args.gene
+drug = args.drug
+gene = args.gene

-drug = 'pyrazinamide'
-gene = 'pncA'
+#drug = 'pyrazinamide'
+#gene = 'pncA'

 gene_match = gene + '_p.'
 print('mut pattern for gene', gene, ':',  gene_match)
@@ -99,6 +98,7 @@ print('position regex:', pos_regex)
 # building cols to extract
 dr_muts_col = 'dr_mutations_' + drug
 other_muts_col = 'other_mutations_' + drug
+resistance_col = 'drtype'

 print('Extracting columns based on variables:\n'
 	, drug
@@ -106,6 +106,8 @@ print('Extracting columns based on variables:\n'
 	, dr_muts_col
 	, '\n'
 	, other_muts_col
+    , '\n'
+    , resistance_col
 	, '\n===============================================================')
 #=======================================================================
 #%% input and output dirs and files
@@ -120,7 +122,7 @@ outdir = datadir + '/' + drug + '/' + 'output'
 # input
 #=======
 #in_filename_master_master  = 'original_tanushree_data_v2.csv' #19k
-in_filename_master  = 'mtb_gwas_meta_v3.csv' #33k
+in_filename_master  = 'mtb_gwas_meta_v5.csv' #34k
 infile_master = datadir + '/' + in_filename_master
 print('Input file: ', infile_master
      , '\n============================================================')
@@ -147,33 +149,37 @@ if in_filename_master == 'original_tanushree_data_v2.csv':
                             , 'country'
                             , 'lineage'
                             , 'sublineage'
-                             , 'drtype' #19k only
+                             , 'drtype'
                             , drug
                             , dr_muts_col
                             , other_muts_col]]
    
-if in_filename_master == 'mtb_gwas_meta_v3.csv':
+if in_filename_master == 'mtb_gwas_meta_v5.csv':
    core_cols = ['id'
-                 , 'country'
-                 , 'country2'
-                 , 'geographic_source'
-                 , 'region'
-                 , 'date'
+                 , 'sample'
+                 , 'patient_id'
                 , 'strain'
                 , 'lineage'
-                 , 'sublineage' #drtype renamed to resistance
-                 , 'resistance'
+                 , 'sublineage' 
+                 , 'country'
+                 , 'country_code'
+                 , 'geographic_source'
+                 #, 'region'
                 , 'location'
                 , 'host_body_site'
                 , 'environment_material'
                 , 'host_status'
+                 , 'host_sex'
+                 , 'submitted_host_sex'
                 , 'hiv_status'
                 , 'HIV_status'
+                 , 'tissue_type'
                 , 'isolation_source']
    
    variable_based_cols = [drug
                           , dr_muts_col
-                           , other_muts_col]
+                           , other_muts_col
+                           , resistance_col]
       
    cols_to_extract = core_cols + variable_based_cols
    print('Extracting', len(cols_to_extract), 'columns from master data')
@@ -193,7 +199,14 @@ print('RESULT: Total samples:', total_samples
 meta_data.isna().sum()
 print('No. of NAs/column:' + '\n', meta_data.isna().sum()
 		, '\n===========================================================')
-#
+
+#%% Write check file
+check_file = outdir + '/' + gene.lower() + '_gwas.csv'
+meta_data.to_csv(check_file)
+print('Writing subsetted gwas data'
+      , '\nFile', check_file
+      , '\nDim:', meta_data.shape)
+
 # glance
 #meta_data.head()
 #total_samples - NA pyrazinamide = ?
@@ -203,7 +216,10 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum()
 # equivalent of table in R
 # drug counts: complete samples for OR calcs
 meta_data[drug].value_counts() 
-print('RESULT: Sus and Res samples:\n', meta_data[drug].value_counts()
+print('===========================================================\n'
+      , 'RESULT: No. of Sus and Res samples:\n', meta_data[drug].value_counts()
+	  , '\n===========================================================\n'
+      , 'RESULT: Percentage of Sus and Res samples:\n', meta_data[drug].value_counts(normalize = True)*100
 		, '\n===========================================================')

 #%%
@@ -306,7 +322,8 @@ print('Predicting total no. of rows in the curated df:', dr_gene_count + other_g
      , '\n===================================================================')
 expected_rows = dr_gene_count + other_gene_count

-del(i, id, wt_other, clean_df, na_count, id2_other, count_gene_other, count_wt)
+#del( wt_other, clean_df, i, id, na_count, id2_other, count_gene_other, count_wt)
+del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other,count_wt )

 #%%
 ############