From 13203e6fe0ed777154ac2431b35775870324b51f Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 7 Aug 2020 13:34:24 +0100 Subject: [PATCH] added data cjeckings script --- scripts/pre_data_extraction.py | 305 +++++++++++++++++++++++++++++++++ 1 file changed, 305 insertions(+) create mode 100755 scripts/pre_data_extraction.py diff --git a/scripts/pre_data_extraction.py b/scripts/pre_data_extraction.py new file mode 100755 index 0000000..98ea5d9 --- /dev/null +++ b/scripts/pre_data_extraction.py @@ -0,0 +1,305 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +''' +Created on Tue Aug 6 12:56:03 2019 + +@author: tanu +''' + +# FIXME: include error checking to enure you only +# concentrate on positions that have structural info? + +# FIXME: import dirs.py to get the basic dir paths available +#======================================================================= +# TASK: + + +#======================================================================= +#%% load libraries +import os, sys +import re +import pandas as pd +import numpy as np +import argparse +#======================================================================= +#%% homdir and curr dir and local imports +homedir = os.path.expanduser('~') +# set working dir +os.getcwd() +os.chdir(homedir + '/git/LSHTM_analysis/scripts') +os.getcwd() + +# import aa dict +#from reference_dict import my_aa_dict # CHECK DIR STRUC THERE! +#from tidy_split import tidy_split +#======================================================================= +#%% command line args +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument('-d', '--drug', help='drug name', default = None) +arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None) + + +args = arg_parser.parse_args() +#======================================================================= +#%% variable assignment: input and output paths & filenames +#drug = args.drug +#gene = args.gene + +drug = 'pyrazinamide' +gene = 'pncA' + +gene_match = gene + '_p.' +print('mut pattern for gene', gene, ':', gene_match) + +nssnp_match = gene_match +'[A-Z]{3}[0-9]+[A-Z]{3}' +print('nsSNP for gene', gene, ':', nssnp_match) + +wt_regex = gene_match.lower()+'(\w{3})' +print('wt regex:', wt_regex) + +mut_regex = r'\d+(\w{3})$' +print('mt regex:', mut_regex) + +pos_regex = r'(\d+)' +print('position regex:', pos_regex) + +# building cols to extract +dr_muts_col = 'dr_mutations_' + drug +other_muts_col = 'other_mutations_' + drug +dr_type = "resistance" + + +print('Extracting columns based on variables:\n' + , drug + , '\n' + , dr_type + , '\n' + , dr_muts_col + , '\n' + , other_muts_col + , '\n===============================================================') +#======================================================================= +#%% input and output dirs and files +#======= +# dirs +#======= +datadir = homedir + '/' + 'git/Data' +indir = datadir + '/' + drug + '/' + 'input' +outdir = datadir + '/' + drug + '/' + 'output' + +#======= +# input +#======= +#in_filename_master_master = 'original_tanushree_data_v2.csv' #19k +in_filename_v2 = 'original_tanushree_data_v2.csv' #19k +infile_master_v2 = datadir + '/' + in_filename_v2 +print('Input file v2: ', infile_master_v2 + , '\n============================================================') + +in_filename_v3 = 'mtb_gwas_meta_v3.csv' #33k +infile_master_v3 = datadir + '/' + in_filename_v3 +print('Input file v3: ', infile_master_v3 + , '\n============================================================') + + +in_filename_v4 = 'mtb_gwas_meta_v4.csv' #34k +infile_master_v4 = datadir + '/' + in_filename_v4 +print('Input file v4: ', infile_master_v4 + , '\n============================================================') + +#======= +# output +#======= +# several output files: in respective sections at the time of outputting files +print('Output filename: in the respective sections' + , '\nOutput path: ', outdir + , '\n=============================================================') + +#%%end of variable assignment for input and output files +#======================================================================= +#%% Read input file +master_data_v2 = pd.read_csv(infile_master_v2, sep = ',', dtype = 'unicode') # ascii +master_data_v3 = pd.read_csv(infile_master_v3, sep = ',', dtype = 'unicode') +master_data_v4 = pd.read_csv(infile_master_v4, sep = ',', dtype = 'unicode') +#DtypeWarning: Columns (48) have mixed types.Specify dtype option on import or set low_memory=False. +# interactivity=interactivity, compiler=compiler, result=result) + +#========== +# na_check +#========== +#================================================================== +v2_na = master_data_v2.isna().sum() +v2_na.name = "v2_na_count" +v2_na = v2_na.to_frame() +v2_na['v2_na_percent'] = master_data_v2.isna().mean().round(4)*100 + +master_data_v2['drtype'].value_counts() +master_data_v2['drtype'].value_counts().sum() == len(master_data_v2) + +v2 = master_data_v2[['id' +, 'country' +, 'lineage' +, 'sublineage' +, 'drtype' +, drug +, dr_muts_col +, other_muts_col]] + +v2.isna().sum() + +print('complete samples v2:', v2['id'].nunique() - v2[drug].isna().sum()) + +#================================================================== +v3_na = master_data_v3.isna().sum() +v3_na.name = "v3_na_count" +v3_na = v3_na.to_frame() +v3_na['v3_na_percent'] = master_data_v3.isna().mean().round(4)*100 + +master_data_v3['resistance'].value_counts() +master_data_v3['resistance'].value_counts().sum() == len(master_data_v3) + +v3 = master_data_v3[['id' +, 'country' +, 'lineage' +, 'sublineage' +, 'resistance' +, drug +, dr_muts_col +, other_muts_col]] + +v3.isna().sum() + +print('complete samples v3:', v3['id'].nunique() - v3[drug].isna().sum()) + +#================================================================== +v4_na = master_data_v4.isna().sum() +v4_na.name = "v4_na_count" +v4_na = v4_na.to_frame() +v4_na['v4_na_percent'] = master_data_v4.isna().mean().round(4)*100 + +v4 = master_data_v4[['id' +, 'country' +, 'lineage' +, 'sublineage' +, drug +, dr_muts_col +, other_muts_col]] + +v4.isna().sum() + +print('complete samples v4:', v4['id'].nunique() - v4[drug].isna().sum()) + +#==================================================================== +# checking ids +id_check1 = master_data_v2['id'].isin(master_data_v3['id']).sum() +print('No. of 19k dataset (v1) ids in 33k dataset (v2):',id_check1) + +id_check2 = master_data_v2['id'].isin(master_data_v4['id']).sum() +print('No. of 19k dataset (v1) ids in 34k dataset (v4):',id_check2) +id_check3 = master_data_v4['id'].isin(master_data_v2['id']).sum() +print('No. of 19k dataset (v1) ids in 34k dataset (v4):',id_check3) + +id_check4 = master_data_v3['sample_accession'].isin(master_data_v4['sample_accession']).sum() +print('No. of 33k dataset (v3) ids in 34k dataset (v3):',id_check4) +id_check5 = master_data_v4['sample_accession'].isin(master_data_v3['sample_accession']).sum() +print('No. of 34k dataset (v4) ids in 33k dataset (v3):', id_check5) + +master_data_v3['sample_accession'].equals(master_data_v3['accession']) +master_data_v3['sample_accession'].isin(master_data_v3['accession']).sum() +master_data_v3['accession'].isin(master_data_v3['sample_accession']).sum() + + +master_data_v4['sample_accession'].equals(master_data_v4['accession']) +master_data_v4['sample_accession'].isin(master_data_v4['accession']).sum() +master_data_v4['accession'].isin(master_data_v4['sample_accession']).sum() + +#=================================================================== + + +#==================================================================== +#which v3 cols are NOT IN V4 +master_data_v3.columns[~master_data_v3.columns.isin(master_data_v4.columns)] + +# which v4 cols ARE NOT in v3 +master_data_v4.columns[~master_data_v4.columns.isin(master_data_v3.columns)] + + +# job: I need resistance and region in v4 data from v3 +# find mergig cols +np.intersect1d(master_data_v3.columns, master_data_v4.columns) + + +master_data_v3['id'].nunique() == len(master_data_v3) +master_data_v3['sample_accession'].nunique() == len(master_data_v3) +master_data_v3['accession'].nunique() == len(master_data_v3) +master_data_v3['run_accession'].nunique() == len(master_data_v3) + + +master_data_v4['id'].nunique() == len(master_data_v4) +master_data_v4['sample_accession'].nunique() == len(master_data_v4) +master_data_v4['accession'].nunique() == len(master_data_v4) +master_data_v4['run_accession'].nunique() == len(master_data_v4) + +c_v4 = master_data_v4[['id', 'sample', 'sample_accession', 'run_accession', 'accession' + , 'location', 'country', 'geographic_source', 'country_code']] +c_v4.isna().sum() + +c_v4_ids = master_data_v4[['id', 'sample', 'sample_accession', 'run_accession', 'accession']] +c_v4_ids.isna().sum() +c_v4_ids.eq(c_v4_ids.iloc[:, 0], axis=0) + +c_v3 = master_data_v3[['id', 'sample_accession', 'run_accession','accession' + , 'location', 'country', 'geographic_source', 'region']] +c_v3.isna().sum() +c_v3_ids = master_data_v3[['id', 'sample_accession', 'run_accession', 'accession']] +c_v3_ids.isna().sum() +c_v3_ids.eq(c_v3_ids.iloc[:, 0], axis=0) +# comment: id, sample, sample_accession and run_accession seem to have no na + +master_data_v4[drug].isna().sum() + + + + + + + + + + + + +#%% Write file: mCSM muts + + +#%% Write file: gene_metadata (i.e gene_LF1) +# where each row has UNIQUE mutations NOT unique sample ids +out_filename_metadata = gene.lower() + '_metadata.csv' +outfile_metadata = outdir + '/' + out_filename_metadata +print('Writing file: LF formatted data' + , '\nFile:', outfile_metadata + , '\n============================================================') + +gene_LF1.to_csv(outfile_metadata, header = True, index = False) +print('Finished writing:', outfile_metadata + , '\nNo. of rows:', len(gene_LF1) + , '\nNo. of cols:', len(gene_LF1.columns) + , '\n=============================================================') +del(out_filename_metadata) + +#%% write file: mCSM style but with repitions for MSA and logo plots + +print('Writing file: mCSM style muts for msa', + '\nFile:', outfile_msa, + '\nmutation format (SNP): {WT}{MUT}', + '\nNo.of lines of msa:', len(all_muts_msa)) + +all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False) + +print('Finished writing:', outfile_msa + , '\nNo. of rows:', len(all_muts_msa) + , '\nNo. of cols:', len(all_muts_msa.columns) + , '\n=============================================================') + +del(out_filename_msa) +