From aca73048c1f49b3cec1455fd435b3d275fac43aa Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Sun, 21 Feb 2021 16:07:33 +0000
Subject: [PATCH] added count.py to count samples for quick checks

---
 scripts/count.py | 288 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 288 insertions(+)
 create mode 100755 scripts/count.py
diff --git a/scripts/count.py b/scripts/count.py
new file mode 100755
index 0000000..444af46
--- /dev/null
+++ b/scripts/count.py
@@ -0,0 +1,288 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Tue Aug  6 12:56:03 2019
+
+@author: tanu
+'''
+
+# FIXME: include error checking to enure you only
+# concentrate on positions that have structural info?
+
+# FIXME: import dirs.py to get the basic dir paths available
+#=======================================================================
+# TASK: extract ALL <gene> matched mutations from GWAS data
+# Input data file has the following format: each row = unique sample id 
+# id,country,lineage,sublineage,drtype,drug,dr_muts_col,other_muts_col...
+# 0,sampleID,USA,lineage2,lineage2.2.1,Drug-resistant,0.0,WT,gene_match<wt>POS<mut>; pncA_c.<wt>POS<mut>...
+# where multiple mutations and multiple mutation types are separated by ';'. 
+# We are interested in the protein coding region i.e mutation with the<gene>_'p.' format.
+# This script splits the mutations on the ';' and extracts protein coding muts only
+# where each row is a separate mutation
+# sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique
+
+
+# output files: all lower case
+# 1) <gene>_gwas.csv
+# 2) <gene>_common_ids.csv
+# 3) <gene>_ambiguous_muts.csv
+# 4) <gene>_mcsm_formatted_snps.csv
+# 5) <gene>_metadata_poscounts.csv
+# 6) <gene>_metadata.csv
+# 7) <gene>_all_muts_msa.csv
+# 8) <gene>_mutational_positons.csv
+
+#------------
+# NOTE
+#-----------
+# drtype is renamed to 'resistance' in the 35k dataset
+# all colnames in the ouput files lowercase
+
+#-------------
+# requires
+#-------------
+#reference_dict.py
+#tidy_split.py
+#=======================================================================
+#%% load libraries
+import os, sys
+import re
+import pandas as pd
+import numpy as np
+import argparse
+#=======================================================================
+#%% dir and local imports
+homedir = os.path.expanduser('~') 
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+os.getcwd()
+
+#=======================================================================
+#%% command line args
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None)
+arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
+arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
+arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
+arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
+arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true')
+
+arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
+
+
+args = arg_parser.parse_args()
+#=======================================================================
+#%% variable assignment: input and output paths & filenames
+drug = args.drug
+gene = args.gene
+datadir = args.datadir
+indir   = args.input_dir
+outdir  = args.output_dir
+make_dirs  = args.make_dirs
+
+#drug = 'streptomycin'
+#gene = 'gid'
+
+#%% input and output dirs and files
+#=======
+# dirs
+#=======
+if not datadir:
+    datadir = homedir + '/' + 'git/Data'
+    
+if not indir:
+    indir = datadir + '/' + drug + '/input'
+    
+if not outdir:
+    outdir = datadir + '/' + drug + '/output'
+
+if make_dirs:
+    print('make_dirs is turned on, creating data dir:', datadir)
+    try: 
+        os.makedirs(datadir, exist_ok = True) 
+        print("Directory '%s' created successfully" %datadir) 
+    except OSError as error: 
+        print("Directory '%s' can not be created") 
+  
+    print('make_dirs is turned on, creating indir:', indir)
+    try: 
+        os.makedirs(indir, exist_ok = True) 
+        print("Directory '%s' created successfully" %indir) 
+    except OSError as error: 
+        print("Directory '%s' can not be created") 
+  
+    print('make_dirs is turned on, creating outdir:', outdir)
+    try: 
+        os.makedirs(outdir, exist_ok = True) 
+        print("Directory '%s' created successfully" %outdir) 
+    except OSError as error: 
+        print("Directory '%s' can not be created") 
+
+# handle missing dirs here
+if not os.path.isdir(datadir):
+    print('ERROR: Data directory does not exist:', datadir
+    , '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option --make_dirs')
+    sys.exit()
+if not os.path.isdir(indir):
+    print('ERROR: Input directory does not exist:', indir
+    , '\nPlease either create or specify indir and rerun\nelse specify cmd option --make_dirs')
+    sys.exit()
+if not os.path.isdir(outdir):
+    print('ERROR: Output directory does not exist:', outdir
+    , '\nPlease create or specify outdir and rerun\nelse specify cmd option --make_dirs')
+    sys.exit()
+    
+# Requires
+from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
+from tidy_split import tidy_split
+
+#=======================================================================    
+gene_match = gene + '_p.'
+print('mut pattern for gene', gene, ':',  gene_match)
+
+nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
+print('nsSNP for gene', gene, ':',  nssnp_match)
+
+wt_regex = gene_match.lower()+'([A-Za-z]{3})'
+print('wt regex:', wt_regex)
+
+mut_regex = r'[0-9]+(\w{3})$'
+print('mt regex:', mut_regex)
+
+pos_regex = r'([0-9]+)'
+print('position regex:', pos_regex)
+
+# building cols to extract
+dr_muts_col = 'dr_mutations_' + drug
+other_muts_col = 'other_mutations_' + drug
+resistance_col = 'drtype'
+
+print('Extracting columns based on variables:\n'
+	, drug
+	, '\n'
+	, dr_muts_col
+	, '\n'
+	, other_muts_col
+    , '\n'
+    , resistance_col
+	, '\n===============================================================')
+#=======================================================================
+
+#=======
+# input
+#=======
+#in_filename_master_master  = 'original_tanushree_data_v2.csv' #19k
+in_filename_master  = 'mtb_gwas_meta_v6.csv' #35k
+infile_master = datadir + '/' + in_filename_master
+print('Input file: ', infile_master
+      , '\n============================================================')
+
+#=======
+# output
+#=======
+# several output files: in respective sections at the time of outputting files
+print('Output filename: in the respective sections'
+      , '\nOutput path: ', outdir
+      , '\n=============================================================')
+
+#%%end of variable assignment for input and output files
+#=======================================================================
+#%% Read input file
+master_data  = pd.read_csv(infile_master, sep = ',')  
+
+# column names
+#list(master_data.columns)
+
+# extract elevant columns to extract from meta data related to the drug
+if in_filename_master == 'original_tanushree_data_v2.csv':
+    meta_data = master_data[['id'
+                             , 'country'
+                             , 'lineage'
+                             , 'sublineage'
+                             , 'drtype'
+                             , drug
+                             , dr_muts_col
+                             , other_muts_col]]
+else:
+    core_cols = ['id'
+                 , 'sample'
+                 #, 'patient_id'
+                 #, 'strain'
+                 , 'lineage'
+                 , 'sublineage' 
+                 #, 'country'
+                 , 'country_code'
+                 , 'geographic_source'
+                 #, 'region'
+                 #, 'location'
+                 #, 'host_body_site'
+                 #, 'environment_material'
+                 #, 'host_status'
+                 #, 'host_sex'
+                 #, 'submitted_host_sex'
+                 #, 'hiv_status'
+                 #, 'HIV_status'
+                 #, 'tissue_type'
+                 #, 'isolation_source'
+                 ,  resistance_col]
+    
+    variable_based_cols = [drug
+                           , dr_muts_col
+                           , other_muts_col]
+       
+    cols_to_extract = core_cols + variable_based_cols
+    print('Extracting', len(cols_to_extract), 'columns from master data')
+
+    meta_data = master_data[cols_to_extract]  
+    del(master_data, variable_based_cols, cols_to_extract)
+
+print('Extracted meta data from filename:', in_filename_master
+      , '\nDim:', meta_data.shape)
+
+# checks and results
+total_samples = meta_data['id'].nunique() 
+print('RESULT: Total samples:', total_samples
+		, '\n===========================================================')
+
+# counts NA per column
+meta_data.isna().sum()
+print('No. of NAs/column:' + '\n', meta_data.isna().sum()
+		, '\n===========================================================\n')
+
+#%% Write check file
+check_file = outdir + '/' + gene.lower() + '_gwas.csv'
+meta_data.to_csv(check_file, index = False)
+print('\n----------------------------------'
+      , '\nWriting subsetted gwas data:', gene
+      , '\n----------------------------------'
+      , '\nFile', check_file
+      , '\nDim:', meta_data.shape)
+
+# equivalent of table in R
+# drug counts: complete samples for OR calcs
+meta_data[drug].value_counts() 
+print('===========================================================\n'
+      , 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts()
+	  , '\n===========================================================\n'
+      , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
+		, '\n===========================================================')
+#%%
+#!!!!!!!!!!
+foo = meta_data[dr_muts_col].value_counts()
+foo = foo.reset_index(name = 'values')
+foo.columns = [dr_muts_col, 'dr_muts_count'] #171
+foo_count = foo.loc[foo[dr_muts_col].str.contains('del', na = False, regex = True, case = False) ]
+
+bar = meta_data[other_muts_col].value_counts()
+bar = bar.reset_index(name = 'values')
+bar.columns = [other_muts_col, 'dr_muts_count'] #64
+bar_count = bar.loc[bar[other_muts_col].str.contains('del', na = False, regex = True, case = False)]
+
+
+tot = len(foo_count) + len(bar_count)
+n_del = tot/len(meta_data)
+n_del*100
+
+
+baz = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) | meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ]