Added output file of subsetted GWAS data for checking
This commit is contained in:
parent
dbf8865203
commit
833e599550
2 changed files with 38 additions and 352 deletions
|
@ -52,7 +52,7 @@ Created on Tue Aug 6 12:56:03 2019
|
|||
import os, sys
|
||||
import re
|
||||
import pandas as pd
|
||||
#import numpy as np
|
||||
import numpy as np
|
||||
import argparse
|
||||
#=======================================================================
|
||||
#%% homedir and current dir and local imports
|
||||
|
@ -68,18 +68,17 @@ from tidy_split import tidy_split
|
|||
#=======================================================================
|
||||
#%% command line args
|
||||
arg_parser = argparse.ArgumentParser()
|
||||
arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
|
||||
arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None)
|
||||
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
|
||||
|
||||
|
||||
args = arg_parser.parse_args()
|
||||
#=======================================================================
|
||||
#%% variable assignment: input and output paths & filenames
|
||||
#drug = args.drug
|
||||
#gene = args.gene
|
||||
drug = args.drug
|
||||
gene = args.gene
|
||||
|
||||
drug = 'pyrazinamide'
|
||||
gene = 'pncA'
|
||||
#drug = 'pyrazinamide'
|
||||
#gene = 'pncA'
|
||||
|
||||
gene_match = gene + '_p.'
|
||||
print('mut pattern for gene', gene, ':', gene_match)
|
||||
|
@ -99,6 +98,7 @@ print('position regex:', pos_regex)
|
|||
# building cols to extract
|
||||
dr_muts_col = 'dr_mutations_' + drug
|
||||
other_muts_col = 'other_mutations_' + drug
|
||||
resistance_col = 'drtype'
|
||||
|
||||
print('Extracting columns based on variables:\n'
|
||||
, drug
|
||||
|
@ -106,6 +106,8 @@ print('Extracting columns based on variables:\n'
|
|||
, dr_muts_col
|
||||
, '\n'
|
||||
, other_muts_col
|
||||
, '\n'
|
||||
, resistance_col
|
||||
, '\n===============================================================')
|
||||
#=======================================================================
|
||||
#%% input and output dirs and files
|
||||
|
@ -120,7 +122,7 @@ outdir = datadir + '/' + drug + '/' + 'output'
|
|||
# input
|
||||
#=======
|
||||
#in_filename_master_master = 'original_tanushree_data_v2.csv' #19k
|
||||
in_filename_master = 'mtb_gwas_meta_v3.csv' #33k
|
||||
in_filename_master = 'mtb_gwas_meta_v5.csv' #34k
|
||||
infile_master = datadir + '/' + in_filename_master
|
||||
print('Input file: ', infile_master
|
||||
, '\n============================================================')
|
||||
|
@ -147,33 +149,37 @@ if in_filename_master == 'original_tanushree_data_v2.csv':
|
|||
, 'country'
|
||||
, 'lineage'
|
||||
, 'sublineage'
|
||||
, 'drtype' #19k only
|
||||
, 'drtype'
|
||||
, drug
|
||||
, dr_muts_col
|
||||
, other_muts_col]]
|
||||
|
||||
if in_filename_master == 'mtb_gwas_meta_v3.csv':
|
||||
if in_filename_master == 'mtb_gwas_meta_v5.csv':
|
||||
core_cols = ['id'
|
||||
, 'country'
|
||||
, 'country2'
|
||||
, 'geographic_source'
|
||||
, 'region'
|
||||
, 'date'
|
||||
, 'sample'
|
||||
, 'patient_id'
|
||||
, 'strain'
|
||||
, 'lineage'
|
||||
, 'sublineage' #drtype renamed to resistance
|
||||
, 'resistance'
|
||||
, 'sublineage'
|
||||
, 'country'
|
||||
, 'country_code'
|
||||
, 'geographic_source'
|
||||
#, 'region'
|
||||
, 'location'
|
||||
, 'host_body_site'
|
||||
, 'environment_material'
|
||||
, 'host_status'
|
||||
, 'host_sex'
|
||||
, 'submitted_host_sex'
|
||||
, 'hiv_status'
|
||||
, 'HIV_status'
|
||||
, 'tissue_type'
|
||||
, 'isolation_source']
|
||||
|
||||
variable_based_cols = [drug
|
||||
, dr_muts_col
|
||||
, other_muts_col]
|
||||
, other_muts_col
|
||||
, resistance_col]
|
||||
|
||||
cols_to_extract = core_cols + variable_based_cols
|
||||
print('Extracting', len(cols_to_extract), 'columns from master data')
|
||||
|
@ -193,7 +199,14 @@ print('RESULT: Total samples:', total_samples
|
|||
meta_data.isna().sum()
|
||||
print('No. of NAs/column:' + '\n', meta_data.isna().sum()
|
||||
, '\n===========================================================')
|
||||
#
|
||||
|
||||
#%% Write check file
|
||||
check_file = outdir + '/' + gene.lower() + '_gwas.csv'
|
||||
meta_data.to_csv(check_file)
|
||||
print('Writing subsetted gwas data'
|
||||
, '\nFile', check_file
|
||||
, '\nDim:', meta_data.shape)
|
||||
|
||||
# glance
|
||||
#meta_data.head()
|
||||
#total_samples - NA pyrazinamide = ?
|
||||
|
@ -203,7 +216,10 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum()
|
|||
# equivalent of table in R
|
||||
# drug counts: complete samples for OR calcs
|
||||
meta_data[drug].value_counts()
|
||||
print('RESULT: Sus and Res samples:\n', meta_data[drug].value_counts()
|
||||
print('===========================================================\n'
|
||||
, 'RESULT: No. of Sus and Res samples:\n', meta_data[drug].value_counts()
|
||||
, '\n===========================================================\n'
|
||||
, 'RESULT: Percentage of Sus and Res samples:\n', meta_data[drug].value_counts(normalize = True)*100
|
||||
, '\n===========================================================')
|
||||
|
||||
#%%
|
||||
|
@ -306,7 +322,8 @@ print('Predicting total no. of rows in the curated df:', dr_gene_count + other_g
|
|||
, '\n===================================================================')
|
||||
expected_rows = dr_gene_count + other_gene_count
|
||||
|
||||
del(i, id, wt_other, clean_df, na_count, id2_other, count_gene_other, count_wt)
|
||||
#del( wt_other, clean_df, i, id, na_count, id2_other, count_gene_other, count_wt)
|
||||
del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other,count_wt )
|
||||
|
||||
#%%
|
||||
############
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue