From aca73048c1f49b3cec1455fd435b3d275fac43aa Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Sun, 21 Feb 2021 16:07:33 +0000 Subject: [PATCH] added count.py to count samples for quick checks --- scripts/count.py | 288 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 288 insertions(+) create mode 100755 scripts/count.py diff --git a/scripts/count.py b/scripts/count.py new file mode 100755 index 0000000..444af46 --- /dev/null +++ b/scripts/count.py @@ -0,0 +1,288 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +''' +Created on Tue Aug 6 12:56:03 2019 + +@author: tanu +''' + +# FIXME: include error checking to enure you only +# concentrate on positions that have structural info? + +# FIXME: import dirs.py to get the basic dir paths available +#======================================================================= +# TASK: extract ALL matched mutations from GWAS data +# Input data file has the following format: each row = unique sample id +# id,country,lineage,sublineage,drtype,drug,dr_muts_col,other_muts_col... +# 0,sampleID,USA,lineage2,lineage2.2.1,Drug-resistant,0.0,WT,gene_matchPOS; pncA_c.POS... +# where multiple mutations and multiple mutation types are separated by ';'. +# We are interested in the protein coding region i.e mutation with the_'p.' format. +# This script splits the mutations on the ';' and extracts protein coding muts only +# where each row is a separate mutation +# sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique + + +# output files: all lower case +# 1) _gwas.csv +# 2) _common_ids.csv +# 3) _ambiguous_muts.csv +# 4) _mcsm_formatted_snps.csv +# 5) _metadata_poscounts.csv +# 6) _metadata.csv +# 7) _all_muts_msa.csv +# 8) _mutational_positons.csv + +#------------ +# NOTE +#----------- +# drtype is renamed to 'resistance' in the 35k dataset +# all colnames in the ouput files lowercase + +#------------- +# requires +#------------- +#reference_dict.py +#tidy_split.py +#======================================================================= +#%% load libraries +import os, sys +import re +import pandas as pd +import numpy as np +import argparse +#======================================================================= +#%% dir and local imports +homedir = os.path.expanduser('~') +# set working dir +os.getcwd() +os.chdir(homedir + '/git/LSHTM_analysis/scripts') +os.getcwd() + +#======================================================================= +#%% command line args +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None) +arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None) +arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') +arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') +arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') +arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true') + +arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode') + + +args = arg_parser.parse_args() +#======================================================================= +#%% variable assignment: input and output paths & filenames +drug = args.drug +gene = args.gene +datadir = args.datadir +indir = args.input_dir +outdir = args.output_dir +make_dirs = args.make_dirs + +#drug = 'streptomycin' +#gene = 'gid' + +#%% input and output dirs and files +#======= +# dirs +#======= +if not datadir: + datadir = homedir + '/' + 'git/Data' + +if not indir: + indir = datadir + '/' + drug + '/input' + +if not outdir: + outdir = datadir + '/' + drug + '/output' + +if make_dirs: + print('make_dirs is turned on, creating data dir:', datadir) + try: + os.makedirs(datadir, exist_ok = True) + print("Directory '%s' created successfully" %datadir) + except OSError as error: + print("Directory '%s' can not be created") + + print('make_dirs is turned on, creating indir:', indir) + try: + os.makedirs(indir, exist_ok = True) + print("Directory '%s' created successfully" %indir) + except OSError as error: + print("Directory '%s' can not be created") + + print('make_dirs is turned on, creating outdir:', outdir) + try: + os.makedirs(outdir, exist_ok = True) + print("Directory '%s' created successfully" %outdir) + except OSError as error: + print("Directory '%s' can not be created") + +# handle missing dirs here +if not os.path.isdir(datadir): + print('ERROR: Data directory does not exist:', datadir + , '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option --make_dirs') + sys.exit() +if not os.path.isdir(indir): + print('ERROR: Input directory does not exist:', indir + , '\nPlease either create or specify indir and rerun\nelse specify cmd option --make_dirs') + sys.exit() +if not os.path.isdir(outdir): + print('ERROR: Output directory does not exist:', outdir + , '\nPlease create or specify outdir and rerun\nelse specify cmd option --make_dirs') + sys.exit() + +# Requires +from reference_dict import my_aa_dict # CHECK DIR STRUC THERE! +from tidy_split import tidy_split + +#======================================================================= +gene_match = gene + '_p.' +print('mut pattern for gene', gene, ':', gene_match) + +nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}' +print('nsSNP for gene', gene, ':', nssnp_match) + +wt_regex = gene_match.lower()+'([A-Za-z]{3})' +print('wt regex:', wt_regex) + +mut_regex = r'[0-9]+(\w{3})$' +print('mt regex:', mut_regex) + +pos_regex = r'([0-9]+)' +print('position regex:', pos_regex) + +# building cols to extract +dr_muts_col = 'dr_mutations_' + drug +other_muts_col = 'other_mutations_' + drug +resistance_col = 'drtype' + +print('Extracting columns based on variables:\n' + , drug + , '\n' + , dr_muts_col + , '\n' + , other_muts_col + , '\n' + , resistance_col + , '\n===============================================================') +#======================================================================= + +#======= +# input +#======= +#in_filename_master_master = 'original_tanushree_data_v2.csv' #19k +in_filename_master = 'mtb_gwas_meta_v6.csv' #35k +infile_master = datadir + '/' + in_filename_master +print('Input file: ', infile_master + , '\n============================================================') + +#======= +# output +#======= +# several output files: in respective sections at the time of outputting files +print('Output filename: in the respective sections' + , '\nOutput path: ', outdir + , '\n=============================================================') + +#%%end of variable assignment for input and output files +#======================================================================= +#%% Read input file +master_data = pd.read_csv(infile_master, sep = ',') + +# column names +#list(master_data.columns) + +# extract elevant columns to extract from meta data related to the drug +if in_filename_master == 'original_tanushree_data_v2.csv': + meta_data = master_data[['id' + , 'country' + , 'lineage' + , 'sublineage' + , 'drtype' + , drug + , dr_muts_col + , other_muts_col]] +else: + core_cols = ['id' + , 'sample' + #, 'patient_id' + #, 'strain' + , 'lineage' + , 'sublineage' + #, 'country' + , 'country_code' + , 'geographic_source' + #, 'region' + #, 'location' + #, 'host_body_site' + #, 'environment_material' + #, 'host_status' + #, 'host_sex' + #, 'submitted_host_sex' + #, 'hiv_status' + #, 'HIV_status' + #, 'tissue_type' + #, 'isolation_source' + , resistance_col] + + variable_based_cols = [drug + , dr_muts_col + , other_muts_col] + + cols_to_extract = core_cols + variable_based_cols + print('Extracting', len(cols_to_extract), 'columns from master data') + + meta_data = master_data[cols_to_extract] + del(master_data, variable_based_cols, cols_to_extract) + +print('Extracted meta data from filename:', in_filename_master + , '\nDim:', meta_data.shape) + +# checks and results +total_samples = meta_data['id'].nunique() +print('RESULT: Total samples:', total_samples + , '\n===========================================================') + +# counts NA per column +meta_data.isna().sum() +print('No. of NAs/column:' + '\n', meta_data.isna().sum() + , '\n===========================================================\n') + +#%% Write check file +check_file = outdir + '/' + gene.lower() + '_gwas.csv' +meta_data.to_csv(check_file, index = False) +print('\n----------------------------------' + , '\nWriting subsetted gwas data:', gene + , '\n----------------------------------' + , '\nFile', check_file + , '\nDim:', meta_data.shape) + +# equivalent of table in R +# drug counts: complete samples for OR calcs +meta_data[drug].value_counts() +print('===========================================================\n' + , 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts() + , '\n===========================================================\n' + , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100 + , '\n===========================================================') +#%% +#!!!!!!!!!! +foo = meta_data[dr_muts_col].value_counts() +foo = foo.reset_index(name = 'values') +foo.columns = [dr_muts_col, 'dr_muts_count'] #171 +foo_count = foo.loc[foo[dr_muts_col].str.contains('del', na = False, regex = True, case = False) ] + +bar = meta_data[other_muts_col].value_counts() +bar = bar.reset_index(name = 'values') +bar.columns = [other_muts_col, 'dr_muts_count'] #64 +bar_count = bar.loc[bar[other_muts_col].str.contains('del', na = False, regex = True, case = False)] + + +tot = len(foo_count) + len(bar_count) +n_del = tot/len(meta_data) +n_del*100 + + +baz = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) | meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ]