From 2d8cb01cb72fcad685b8f2025abea4a73250772e Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 11 Aug 2020 18:34:02 +0100
Subject: [PATCH] added output file for checking

---
 scripts/data_extraction.py     |  59 +++---
 scripts/pre_data_extraction.py | 331 ---------------------------------
 2 files changed, 38 insertions(+), 352 deletions(-)
 delete mode 100755 scripts/pre_data_extraction.py

diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py
index a682fbc..262175e 100755
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -52,7 +52,7 @@ Created on Tue Aug  6 12:56:03 2019
 import os, sys
 import re
 import pandas as pd
-#import numpy as np
+import numpy as np
 import argparse
 #=======================================================================
 #%% homdir and curr dir and local imports
@@ -68,18 +68,17 @@ from tidy_split import tidy_split
 #=======================================================================
 #%% command line args
 arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
+arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None)
 arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
 
-
 args = arg_parser.parse_args()
 #=======================================================================
 #%% variable assignment: input and output paths & filenames
-#drug = args.drug
-#gene = args.gene
+drug = args.drug
+gene = args.gene
 
-drug = 'pyrazinamide'
-gene = 'pncA'
+#drug = 'pyrazinamide'
+#gene = 'pncA'
 
 gene_match = gene + '_p.'
 print('mut pattern for gene', gene, ':',  gene_match)
@@ -99,6 +98,7 @@ print('position regex:', pos_regex)
 # building cols to extract
 dr_muts_col = 'dr_mutations_' + drug
 other_muts_col = 'other_mutations_' + drug
+resistance_col = 'drtype'
 
 print('Extracting columns based on variables:\n'
 	, drug
@@ -106,6 +106,8 @@ print('Extracting columns based on variables:\n'
 	, dr_muts_col
 	, '\n'
 	, other_muts_col
+    , '\n'
+    , resistance_col
 	, '\n===============================================================')
 #=======================================================================
 #%% input and output dirs and files
@@ -120,7 +122,7 @@ outdir = datadir + '/' + drug + '/' + 'output'
 # input
 #=======
 #in_filename_master_master  = 'original_tanushree_data_v2.csv' #19k
-in_filename_master  = 'mtb_gwas_meta_v3.csv' #33k
+in_filename_master  = 'mtb_gwas_meta_v5.csv' #34k
 infile_master = datadir + '/' + in_filename_master
 print('Input file: ', infile_master
       , '\n============================================================')
@@ -147,33 +149,37 @@ if in_filename_master == 'original_tanushree_data_v2.csv':
                              , 'country'
                              , 'lineage'
                              , 'sublineage'
-                             , 'drtype' #19k only
+                             , 'drtype'
                              , drug
                              , dr_muts_col
                              , other_muts_col]]
     
-if in_filename_master == 'mtb_gwas_meta_v3.csv':
+if in_filename_master == 'mtb_gwas_meta_v5.csv':
     core_cols = ['id'
-                 , 'country'
-                 , 'country2'
-                 , 'geographic_source'
-                 , 'region'
-                 , 'date'
+                 , 'sample'
+                 , 'patient_id'
                  , 'strain'
                  , 'lineage'
-                 , 'sublineage' #drtype renamed to resistance
-                 , 'resistance'
+                 , 'sublineage' 
+                 , 'country'
+                 , 'country_code'
+                 , 'geographic_source'
+                 #, 'region'
                  , 'location'
                  , 'host_body_site'
                  , 'environment_material'
                  , 'host_status'
+                 , 'host_sex'
+                 , 'submitted_host_sex'
                  , 'hiv_status'
                  , 'HIV_status'
+                 , 'tissue_type'
                  , 'isolation_source']
     
     variable_based_cols = [drug
                            , dr_muts_col
-                           , other_muts_col]
+                           , other_muts_col
+                           , resistance_col]
        
     cols_to_extract = core_cols + variable_based_cols
     print('Extracting', len(cols_to_extract), 'columns from master data')
@@ -193,7 +199,14 @@ print('RESULT: Total samples:', total_samples
 meta_data.isna().sum()
 print('No. of NAs/column:' + '\n', meta_data.isna().sum()
 		, '\n===========================================================')
-#
+
+#%% Write check file
+check_file = outdir + '/' + gene.lower() + '_gwas.csv'
+meta_data.to_csv(check_file)
+print('Writing subsetted gwas data'
+      , '\nFile', check_file
+      , '\nDim:', meta_data.shape)
+
 # glance
 #meta_data.head()
 #total_samples - NA pyrazinamide = ?
@@ -203,7 +216,10 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum()
 # equivalent of table in R
 # drug counts: complete samples for OR calcs
 meta_data[drug].value_counts() 
-print('RESULT: Sus and Res samples:\n', meta_data[drug].value_counts()
+print('===========================================================\n'
+      , 'RESULT: No. of Sus and Res samples:\n', meta_data[drug].value_counts()
+	  , '\n===========================================================\n'
+      , 'RESULT: Percentage of Sus and Res samples:\n', meta_data[drug].value_counts(normalize = True)*100
 		, '\n===========================================================')
 
 #%%
@@ -306,7 +322,8 @@ print('Predicting total no. of rows in the curated df:', dr_gene_count + other_g
       , '\n===================================================================')
 expected_rows = dr_gene_count + other_gene_count
 
-del(i, id, wt_other, clean_df, na_count, id2_other, count_gene_other, count_wt)
+#del( wt_other, clean_df, i, id, na_count, id2_other, count_gene_other, count_wt)
+del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other,count_wt )
 
 #%%
 ############
diff --git a/scripts/pre_data_extraction.py b/scripts/pre_data_extraction.py
deleted file mode 100755
index 876299b..0000000
--- a/scripts/pre_data_extraction.py
+++ /dev/null
@@ -1,331 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-'''
-Created on Tue Aug  6 12:56:03 2019
-
-@author: tanu
-'''
-
-# FIXME: include error checking to enure you only
-# concentrate on positions that have structural info?
-
-# FIXME: import dirs.py to get the basic dir paths available
-#=======================================================================
-# TASK: 
-
-
-#=======================================================================
-#%% load libraries
-import os, sys
-import re
-import pandas as pd
-import numpy as np
-import argparse
-#=======================================================================
-#%% homdir and curr dir and local imports
-homedir = os.path.expanduser('~') 
-# set working dir
-os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/scripts')
-os.getcwd()
-
-# import aa dict
-#from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
-#from tidy_split import tidy_split
-#=======================================================================
-#%% command line args
-arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
-arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
-
-
-args = arg_parser.parse_args()
-#=======================================================================
-#%% variable assignment: input and output paths & filenames
-#drug = args.drug
-#gene = args.gene
-
-drug = 'pyrazinamide'
-gene = 'pncA'
-
-gene_match = gene + '_p.'
-print('mut pattern for gene', gene, ':',  gene_match)
-
-nssnp_match = gene_match +'[A-Z]{3}[0-9]+[A-Z]{3}'
-print('nsSNP for gene', gene, ':',  nssnp_match)
-
-wt_regex = gene_match.lower()+'(\w{3})'
-print('wt regex:', wt_regex)
-
-mut_regex = r'\d+(\w{3})$'
-print('mt regex:', mut_regex)
-
-pos_regex = r'(\d+)'
-print('position regex:', pos_regex)
-
-# building cols to extract
-dr_muts_col = 'dr_mutations_' + drug
-other_muts_col = 'other_mutations_' + drug
-dr_type = "resistance"
-
-
-print('Extracting columns based on variables:\n'
-	, drug
-	, '\n'
-    , dr_type
-    , '\n'
-	, dr_muts_col
-	, '\n'
-	, other_muts_col
-	, '\n===============================================================')
-#=======================================================================
-#%% input and output dirs and files
-#=======
-# dirs
-#=======
-datadir = homedir + '/' + 'git/Data'
-indir = datadir + '/' + drug + '/' + 'input'
-outdir = datadir + '/' + drug + '/' + 'output'
-
-#=======
-# input
-#=======
-#in_filename_master_master  = 'original_tanushree_data_v2.csv' #19k
-in_filename_v2  = 'original_tanushree_data_v2.csv' #19k
-infile_master_v2 = datadir + '/' + in_filename_v2
-print('Input file v2: ', infile_master_v2
-      , '\n============================================================')
-
-in_filename_v3  = 'mtb_gwas_meta_v3.csv' #33k
-infile_master_v3 = datadir + '/' + in_filename_v3
-print('Input file v3: ', infile_master_v3
-      , '\n============================================================')
-      
-      
-in_filename_v4  = 'mtb_gwas_meta_v4.csv' #34k
-infile_master_v4 = datadir + '/' + in_filename_v4
-print('Input file v4: ', infile_master_v4
-      , '\n============================================================')
-      
-in_filename_v5  = 'mtb_gwas_meta_v5.csv' #34k
-infile_master_v5 = datadir + '/' + in_filename_v5
-print('Input file v4: ', infile_master_v5
-      , '\n============================================================')
-
-#=======
-# output
-#=======
-# several output files: in respective sections at the time of outputting files
-print('Output filename: in the respective sections'
-      , '\nOutput path: ', outdir
-      , '\n=============================================================')
-
-#%%end of variable assignment for input and output files
-#=======================================================================
-#%% Read input file
-master_data_v2  = pd.read_csv(infile_master_v2, sep = ',', dtype = 'unicode') # ascii
-master_data_v3  = pd.read_csv(infile_master_v3, sep = ',', dtype = 'unicode')  
-master_data_v4  = pd.read_csv(infile_master_v4, sep = ',', dtype = 'unicode') 
-master_data_v5  = pd.read_csv(infile_master_v5, sep = ',', dtype = 'unicode')   
-#DtypeWarning: Columns (48) have mixed types.Specify dtype option on import or set low_memory=False.
-#  interactivity=interactivity, compiler=compiler, result=result)
-
-#==========
-# na_check
-#==========
-#==================================================================
-v2_na = master_data_v2.isna().sum()
-v2_na.name = "v2_na_count"
-v2_na = v2_na.to_frame()
-v2_na['v2_na_percent'] = master_data_v2.isna().mean().round(4)*100
-
-master_data_v2['drtype'].value_counts() 
-master_data_v2['drtype'].value_counts().sum() == len(master_data_v2)
-
-v2 = master_data_v2[['id'
-, 'country'
-, 'lineage'
-, 'sublineage'
-, 'drtype'
-, drug
-, dr_muts_col
-, other_muts_col]]
-
-v2.isna().sum()
-
-print('complete samples v2:', v2['id'].nunique() - v2[drug].isna().sum())
-
-#==================================================================
-v3_na = master_data_v3.isna().sum()
-v3_na.name = "v3_na_count"
-v3_na = v3_na.to_frame()
-v3_na['v3_na_percent'] = master_data_v3.isna().mean().round(4)*100
-
-master_data_v3['resistance'].value_counts() 
-master_data_v3['resistance'].value_counts().sum() == len(master_data_v3)
-
-v3 = master_data_v3[['id'
-, 'country'
-, 'lineage'
-, 'sublineage'
-, 'resistance'
-, drug
-, dr_muts_col
-, other_muts_col]]
-
-v3.isna().sum()
-
-print('complete samples v3:', v3['id'].nunique() - v3[drug].isna().sum())
-
-#==================================================================
-v4_na = master_data_v4.isna().sum()
-v4_na.name = "v4_na_count"
-v4_na = v4_na.to_frame()
-v4_na['v4_na_percent'] = master_data_v4.isna().mean().round(4)*100
-
-v4 = master_data_v4[['id'
-, 'country'
-, 'lineage'
-, 'sublineage'
-, drug
-, dr_muts_col
-, other_muts_col]]
-
-v4.isna().sum()
-
-print('complete samples v4:', v4['id'].nunique() - v4[drug].isna().sum())
-#==================================================================
-v5_na = master_data_v5.isna().sum()
-v5_na.name = "v5_na_count"
-v5_na = v5_na.to_frame()
-v5_na['v4_na_percent'] = master_data_v5.isna().mean().round(4)*100
-
-v5 = master_data_v5[['id'
-, 'country'
-, 'lineage'
-, 'sublineage'
-, drug
-, dr_muts_col
-, other_muts_col]]
-
-v5.isna().sum()
-
-print('complete samples v5:', v5['id'].nunique() - v5[drug].isna().sum())
-
-
-
-
-#====================================================================
-# checking ids
-id_check1 = master_data_v2['id'].isin(master_data_v3['id']).sum()
-print('No. of 19k dataset (v1) ids in 33k dataset (v2):',id_check1)
-
-id_check2 = master_data_v2['id'].isin(master_data_v4['id']).sum()
-print('No. of 19k dataset (v1) ids in 34k dataset (v4):',id_check2)
-id_check3 = master_data_v4['id'].isin(master_data_v2['id']).sum()
-print('No. of 19k dataset (v1) ids in 34k dataset (v4):',id_check3)
-
-id_check4 = master_data_v3['sample_accession'].isin(master_data_v4['sample_accession']).sum()
-print('No. of 33k dataset (v3) ids in 34k dataset (v3):',id_check4)
-id_check5 = master_data_v4['sample_accession'].isin(master_data_v3['sample_accession']).sum()
-print('No. of 34k dataset (v4) ids in 33k dataset (v3):', id_check5)
-
-master_data_v3['sample_accession'].equals(master_data_v3['accession'])
-master_data_v3['sample_accession'].isin(master_data_v3['accession']).sum()
-master_data_v3['accession'].isin(master_data_v3['sample_accession']).sum()
-
-
-master_data_v4['sample_accession'].equals(master_data_v4['accession'])
-master_data_v4['sample_accession'].isin(master_data_v4['accession']).sum()
-master_data_v4['accession'].isin(master_data_v4['sample_accession']).sum()
-
-#===================================================================
-
-
-#====================================================================
-#which v3 cols are NOT IN V4
-master_data_v3.columns[~master_data_v3.columns.isin(master_data_v4.columns)]
-
-# which v4 cols ARE NOT in v3
-master_data_v4.columns[~master_data_v4.columns.isin(master_data_v3.columns)]
-
-
-# job: I need resistance and region in v4 data from v3
-# find mergig cols
-np.intersect1d(master_data_v3.columns, master_data_v4.columns)
-
-
-master_data_v3['id'].nunique() == len(master_data_v3)
-master_data_v3['sample_accession'].nunique() == len(master_data_v3)
-master_data_v3['accession'].nunique() == len(master_data_v3)
-master_data_v3['run_accession'].nunique() == len(master_data_v3)
-
-
-master_data_v4['id'].nunique() == len(master_data_v4)
-master_data_v4['sample_accession'].nunique() == len(master_data_v4)
-master_data_v4['accession'].nunique() == len(master_data_v4)
-master_data_v4['run_accession'].nunique() == len(master_data_v4)
-
-c_v4 = master_data_v4[['id', 'sample', 'sample_accession', 'run_accession', 'accession'
-                       , 'location', 'country', 'geographic_source', 'country_code']]
-c_v4.isna().sum()
-
-c_v4_ids = master_data_v4[['id', 'sample', 'sample_accession', 'run_accession', 'accession']]
-c_v4_ids.isna().sum()
-c_v4_ids.eq(c_v4_ids.iloc[:, 0], axis=0)
-
-c_v3 = master_data_v3[['id', 'sample_accession', 'run_accession','accession'
-                       , 'location', 'country', 'geographic_source', 'region']]
-c_v3.isna().sum()
-c_v3_ids = master_data_v3[['id', 'sample_accession', 'run_accession', 'accession']]
-c_v3_ids.isna().sum()
-c_v3_ids.eq(c_v3_ids.iloc[:, 0], axis=0)
-# comment: id, sample, sample_accession and run_accession seem to have no na
-
-master_data_v4[drug].isna().sum()
-
-
-
-
-
-
-
-
-
-
-
-
-#%% Write file: mCSM muts
-
-
-#%% Write file: gene_metadata (i.e gene_LF1)
-# where each row has UNIQUE mutations NOT unique sample ids
-out_filename_metadata = gene.lower() + '_metadata.csv'
-outfile_metadata = outdir + '/' + out_filename_metadata
-print('Writing file: LF formatted data'
-      , '\nFile:', outfile_metadata
-      , '\n============================================================')
-
-gene_LF1.to_csv(outfile_metadata, header = True, index = False)
-print('Finished writing:', outfile_metadata
-      , '\nNo. of rows:', len(gene_LF1)
-      , '\nNo. of cols:', len(gene_LF1.columns)
-      , '\n=============================================================')
-del(out_filename_metadata)
-
-#%% write file: mCSM style but with repitions for MSA and logo plots
-
-print('Writing file: mCSM style muts for msa',
-      '\nFile:', outfile_msa,
-      '\nmutation format (SNP): {WT}<POS>{MUT}',
-      '\nNo.of lines of msa:', len(all_muts_msa))
-
-all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False)
-
-print('Finished writing:', outfile_msa
-      , '\nNo. of rows:', len(all_muts_msa)
-      , '\nNo. of cols:', len(all_muts_msa.columns)
-      , '\n=============================================================')
-
-del(out_filename_msa)
-