From 2d8cb01cb72fcad685b8f2025abea4a73250772e Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 11 Aug 2020 18:34:02 +0100 Subject: [PATCH] added output file for checking --- scripts/data_extraction.py | 59 +++--- scripts/pre_data_extraction.py | 331 --------------------------------- 2 files changed, 38 insertions(+), 352 deletions(-) delete mode 100755 scripts/pre_data_extraction.py diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index a682fbc..262175e 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -52,7 +52,7 @@ Created on Tue Aug 6 12:56:03 2019 import os, sys import re import pandas as pd -#import numpy as np +import numpy as np import argparse #======================================================================= #%% homdir and curr dir and local imports @@ -68,18 +68,17 @@ from tidy_split import tidy_split #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() -arg_parser.add_argument('-d', '--drug', help='drug name', default = None) +arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None) arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None) - args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output paths & filenames -#drug = args.drug -#gene = args.gene +drug = args.drug +gene = args.gene -drug = 'pyrazinamide' -gene = 'pncA' +#drug = 'pyrazinamide' +#gene = 'pncA' gene_match = gene + '_p.' print('mut pattern for gene', gene, ':', gene_match) @@ -99,6 +98,7 @@ print('position regex:', pos_regex) # building cols to extract dr_muts_col = 'dr_mutations_' + drug other_muts_col = 'other_mutations_' + drug +resistance_col = 'drtype' print('Extracting columns based on variables:\n' , drug @@ -106,6 +106,8 @@ print('Extracting columns based on variables:\n' , dr_muts_col , '\n' , other_muts_col + , '\n' + , resistance_col , '\n===============================================================') #======================================================================= #%% input and output dirs and files @@ -120,7 +122,7 @@ outdir = datadir + '/' + drug + '/' + 'output' # input #======= #in_filename_master_master = 'original_tanushree_data_v2.csv' #19k -in_filename_master = 'mtb_gwas_meta_v3.csv' #33k +in_filename_master = 'mtb_gwas_meta_v5.csv' #34k infile_master = datadir + '/' + in_filename_master print('Input file: ', infile_master , '\n============================================================') @@ -147,33 +149,37 @@ if in_filename_master == 'original_tanushree_data_v2.csv': , 'country' , 'lineage' , 'sublineage' - , 'drtype' #19k only + , 'drtype' , drug , dr_muts_col , other_muts_col]] -if in_filename_master == 'mtb_gwas_meta_v3.csv': +if in_filename_master == 'mtb_gwas_meta_v5.csv': core_cols = ['id' - , 'country' - , 'country2' - , 'geographic_source' - , 'region' - , 'date' + , 'sample' + , 'patient_id' , 'strain' , 'lineage' - , 'sublineage' #drtype renamed to resistance - , 'resistance' + , 'sublineage' + , 'country' + , 'country_code' + , 'geographic_source' + #, 'region' , 'location' , 'host_body_site' , 'environment_material' , 'host_status' + , 'host_sex' + , 'submitted_host_sex' , 'hiv_status' , 'HIV_status' + , 'tissue_type' , 'isolation_source'] variable_based_cols = [drug , dr_muts_col - , other_muts_col] + , other_muts_col + , resistance_col] cols_to_extract = core_cols + variable_based_cols print('Extracting', len(cols_to_extract), 'columns from master data') @@ -193,7 +199,14 @@ print('RESULT: Total samples:', total_samples meta_data.isna().sum() print('No. of NAs/column:' + '\n', meta_data.isna().sum() , '\n===========================================================') -# + +#%% Write check file +check_file = outdir + '/' + gene.lower() + '_gwas.csv' +meta_data.to_csv(check_file) +print('Writing subsetted gwas data' + , '\nFile', check_file + , '\nDim:', meta_data.shape) + # glance #meta_data.head() #total_samples - NA pyrazinamide = ? @@ -203,7 +216,10 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum() # equivalent of table in R # drug counts: complete samples for OR calcs meta_data[drug].value_counts() -print('RESULT: Sus and Res samples:\n', meta_data[drug].value_counts() +print('===========================================================\n' + , 'RESULT: No. of Sus and Res samples:\n', meta_data[drug].value_counts() + , '\n===========================================================\n' + , 'RESULT: Percentage of Sus and Res samples:\n', meta_data[drug].value_counts(normalize = True)*100 , '\n===========================================================') #%% @@ -306,7 +322,8 @@ print('Predicting total no. of rows in the curated df:', dr_gene_count + other_g , '\n===================================================================') expected_rows = dr_gene_count + other_gene_count -del(i, id, wt_other, clean_df, na_count, id2_other, count_gene_other, count_wt) +#del( wt_other, clean_df, i, id, na_count, id2_other, count_gene_other, count_wt) +del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other,count_wt ) #%% ############ diff --git a/scripts/pre_data_extraction.py b/scripts/pre_data_extraction.py deleted file mode 100755 index 876299b..0000000 --- a/scripts/pre_data_extraction.py +++ /dev/null @@ -1,331 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -''' -Created on Tue Aug 6 12:56:03 2019 - -@author: tanu -''' - -# FIXME: include error checking to enure you only -# concentrate on positions that have structural info? - -# FIXME: import dirs.py to get the basic dir paths available -#======================================================================= -# TASK: - - -#======================================================================= -#%% load libraries -import os, sys -import re -import pandas as pd -import numpy as np -import argparse -#======================================================================= -#%% homdir and curr dir and local imports -homedir = os.path.expanduser('~') -# set working dir -os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/scripts') -os.getcwd() - -# import aa dict -#from reference_dict import my_aa_dict # CHECK DIR STRUC THERE! -#from tidy_split import tidy_split -#======================================================================= -#%% command line args -arg_parser = argparse.ArgumentParser() -arg_parser.add_argument('-d', '--drug', help='drug name', default = None) -arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None) - - -args = arg_parser.parse_args() -#======================================================================= -#%% variable assignment: input and output paths & filenames -#drug = args.drug -#gene = args.gene - -drug = 'pyrazinamide' -gene = 'pncA' - -gene_match = gene + '_p.' -print('mut pattern for gene', gene, ':', gene_match) - -nssnp_match = gene_match +'[A-Z]{3}[0-9]+[A-Z]{3}' -print('nsSNP for gene', gene, ':', nssnp_match) - -wt_regex = gene_match.lower()+'(\w{3})' -print('wt regex:', wt_regex) - -mut_regex = r'\d+(\w{3})$' -print('mt regex:', mut_regex) - -pos_regex = r'(\d+)' -print('position regex:', pos_regex) - -# building cols to extract -dr_muts_col = 'dr_mutations_' + drug -other_muts_col = 'other_mutations_' + drug -dr_type = "resistance" - - -print('Extracting columns based on variables:\n' - , drug - , '\n' - , dr_type - , '\n' - , dr_muts_col - , '\n' - , other_muts_col - , '\n===============================================================') -#======================================================================= -#%% input and output dirs and files -#======= -# dirs -#======= -datadir = homedir + '/' + 'git/Data' -indir = datadir + '/' + drug + '/' + 'input' -outdir = datadir + '/' + drug + '/' + 'output' - -#======= -# input -#======= -#in_filename_master_master = 'original_tanushree_data_v2.csv' #19k -in_filename_v2 = 'original_tanushree_data_v2.csv' #19k -infile_master_v2 = datadir + '/' + in_filename_v2 -print('Input file v2: ', infile_master_v2 - , '\n============================================================') - -in_filename_v3 = 'mtb_gwas_meta_v3.csv' #33k -infile_master_v3 = datadir + '/' + in_filename_v3 -print('Input file v3: ', infile_master_v3 - , '\n============================================================') - - -in_filename_v4 = 'mtb_gwas_meta_v4.csv' #34k -infile_master_v4 = datadir + '/' + in_filename_v4 -print('Input file v4: ', infile_master_v4 - , '\n============================================================') - -in_filename_v5 = 'mtb_gwas_meta_v5.csv' #34k -infile_master_v5 = datadir + '/' + in_filename_v5 -print('Input file v4: ', infile_master_v5 - , '\n============================================================') - -#======= -# output -#======= -# several output files: in respective sections at the time of outputting files -print('Output filename: in the respective sections' - , '\nOutput path: ', outdir - , '\n=============================================================') - -#%%end of variable assignment for input and output files -#======================================================================= -#%% Read input file -master_data_v2 = pd.read_csv(infile_master_v2, sep = ',', dtype = 'unicode') # ascii -master_data_v3 = pd.read_csv(infile_master_v3, sep = ',', dtype = 'unicode') -master_data_v4 = pd.read_csv(infile_master_v4, sep = ',', dtype = 'unicode') -master_data_v5 = pd.read_csv(infile_master_v5, sep = ',', dtype = 'unicode') -#DtypeWarning: Columns (48) have mixed types.Specify dtype option on import or set low_memory=False. -# interactivity=interactivity, compiler=compiler, result=result) - -#========== -# na_check -#========== -#================================================================== -v2_na = master_data_v2.isna().sum() -v2_na.name = "v2_na_count" -v2_na = v2_na.to_frame() -v2_na['v2_na_percent'] = master_data_v2.isna().mean().round(4)*100 - -master_data_v2['drtype'].value_counts() -master_data_v2['drtype'].value_counts().sum() == len(master_data_v2) - -v2 = master_data_v2[['id' -, 'country' -, 'lineage' -, 'sublineage' -, 'drtype' -, drug -, dr_muts_col -, other_muts_col]] - -v2.isna().sum() - -print('complete samples v2:', v2['id'].nunique() - v2[drug].isna().sum()) - -#================================================================== -v3_na = master_data_v3.isna().sum() -v3_na.name = "v3_na_count" -v3_na = v3_na.to_frame() -v3_na['v3_na_percent'] = master_data_v3.isna().mean().round(4)*100 - -master_data_v3['resistance'].value_counts() -master_data_v3['resistance'].value_counts().sum() == len(master_data_v3) - -v3 = master_data_v3[['id' -, 'country' -, 'lineage' -, 'sublineage' -, 'resistance' -, drug -, dr_muts_col -, other_muts_col]] - -v3.isna().sum() - -print('complete samples v3:', v3['id'].nunique() - v3[drug].isna().sum()) - -#================================================================== -v4_na = master_data_v4.isna().sum() -v4_na.name = "v4_na_count" -v4_na = v4_na.to_frame() -v4_na['v4_na_percent'] = master_data_v4.isna().mean().round(4)*100 - -v4 = master_data_v4[['id' -, 'country' -, 'lineage' -, 'sublineage' -, drug -, dr_muts_col -, other_muts_col]] - -v4.isna().sum() - -print('complete samples v4:', v4['id'].nunique() - v4[drug].isna().sum()) -#================================================================== -v5_na = master_data_v5.isna().sum() -v5_na.name = "v5_na_count" -v5_na = v5_na.to_frame() -v5_na['v4_na_percent'] = master_data_v5.isna().mean().round(4)*100 - -v5 = master_data_v5[['id' -, 'country' -, 'lineage' -, 'sublineage' -, drug -, dr_muts_col -, other_muts_col]] - -v5.isna().sum() - -print('complete samples v5:', v5['id'].nunique() - v5[drug].isna().sum()) - - - - -#==================================================================== -# checking ids -id_check1 = master_data_v2['id'].isin(master_data_v3['id']).sum() -print('No. of 19k dataset (v1) ids in 33k dataset (v2):',id_check1) - -id_check2 = master_data_v2['id'].isin(master_data_v4['id']).sum() -print('No. of 19k dataset (v1) ids in 34k dataset (v4):',id_check2) -id_check3 = master_data_v4['id'].isin(master_data_v2['id']).sum() -print('No. of 19k dataset (v1) ids in 34k dataset (v4):',id_check3) - -id_check4 = master_data_v3['sample_accession'].isin(master_data_v4['sample_accession']).sum() -print('No. of 33k dataset (v3) ids in 34k dataset (v3):',id_check4) -id_check5 = master_data_v4['sample_accession'].isin(master_data_v3['sample_accession']).sum() -print('No. of 34k dataset (v4) ids in 33k dataset (v3):', id_check5) - -master_data_v3['sample_accession'].equals(master_data_v3['accession']) -master_data_v3['sample_accession'].isin(master_data_v3['accession']).sum() -master_data_v3['accession'].isin(master_data_v3['sample_accession']).sum() - - -master_data_v4['sample_accession'].equals(master_data_v4['accession']) -master_data_v4['sample_accession'].isin(master_data_v4['accession']).sum() -master_data_v4['accession'].isin(master_data_v4['sample_accession']).sum() - -#=================================================================== - - -#==================================================================== -#which v3 cols are NOT IN V4 -master_data_v3.columns[~master_data_v3.columns.isin(master_data_v4.columns)] - -# which v4 cols ARE NOT in v3 -master_data_v4.columns[~master_data_v4.columns.isin(master_data_v3.columns)] - - -# job: I need resistance and region in v4 data from v3 -# find mergig cols -np.intersect1d(master_data_v3.columns, master_data_v4.columns) - - -master_data_v3['id'].nunique() == len(master_data_v3) -master_data_v3['sample_accession'].nunique() == len(master_data_v3) -master_data_v3['accession'].nunique() == len(master_data_v3) -master_data_v3['run_accession'].nunique() == len(master_data_v3) - - -master_data_v4['id'].nunique() == len(master_data_v4) -master_data_v4['sample_accession'].nunique() == len(master_data_v4) -master_data_v4['accession'].nunique() == len(master_data_v4) -master_data_v4['run_accession'].nunique() == len(master_data_v4) - -c_v4 = master_data_v4[['id', 'sample', 'sample_accession', 'run_accession', 'accession' - , 'location', 'country', 'geographic_source', 'country_code']] -c_v4.isna().sum() - -c_v4_ids = master_data_v4[['id', 'sample', 'sample_accession', 'run_accession', 'accession']] -c_v4_ids.isna().sum() -c_v4_ids.eq(c_v4_ids.iloc[:, 0], axis=0) - -c_v3 = master_data_v3[['id', 'sample_accession', 'run_accession','accession' - , 'location', 'country', 'geographic_source', 'region']] -c_v3.isna().sum() -c_v3_ids = master_data_v3[['id', 'sample_accession', 'run_accession', 'accession']] -c_v3_ids.isna().sum() -c_v3_ids.eq(c_v3_ids.iloc[:, 0], axis=0) -# comment: id, sample, sample_accession and run_accession seem to have no na - -master_data_v4[drug].isna().sum() - - - - - - - - - - - - -#%% Write file: mCSM muts - - -#%% Write file: gene_metadata (i.e gene_LF1) -# where each row has UNIQUE mutations NOT unique sample ids -out_filename_metadata = gene.lower() + '_metadata.csv' -outfile_metadata = outdir + '/' + out_filename_metadata -print('Writing file: LF formatted data' - , '\nFile:', outfile_metadata - , '\n============================================================') - -gene_LF1.to_csv(outfile_metadata, header = True, index = False) -print('Finished writing:', outfile_metadata - , '\nNo. of rows:', len(gene_LF1) - , '\nNo. of cols:', len(gene_LF1.columns) - , '\n=============================================================') -del(out_filename_metadata) - -#%% write file: mCSM style but with repitions for MSA and logo plots - -print('Writing file: mCSM style muts for msa', - '\nFile:', outfile_msa, - '\nmutation format (SNP): {WT}{MUT}', - '\nNo.of lines of msa:', len(all_muts_msa)) - -all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False) - -print('Finished writing:', outfile_msa - , '\nNo. of rows:', len(all_muts_msa) - , '\nNo. of cols:', len(all_muts_msa.columns) - , '\n=============================================================') - -del(out_filename_msa) -