From 8e16b2635eb9b72f5f6e0e968d34092d925c9966 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 29 Sep 2020 16:09:54 +0100
Subject: [PATCH] added ../data_extraction_epistasis.py for getting list for
 epistasis work

---
 scripts/data_extraction_epistasis.py | 304 +++++++++++++++++++++++++++
 1 file changed, 304 insertions(+)
 create mode 100755 scripts/data_extraction_epistasis.py
diff --git a/scripts/data_extraction_epistasis.py b/scripts/data_extraction_epistasis.py
new file mode 100755
index 0000000..6626338
--- /dev/null
+++ b/scripts/data_extraction_epistasis.py
@@ -0,0 +1,304 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Tue Aug  6 12:56:03 2019
+
+@author: tanu
+'''
+
+# FIXME: include error checking to enure you only
+# concentrate on positions that have structural info?
+
+# FIXME: import dirs.py to get the basic dir paths available
+#=======================================================================
+# TASK: extract ALL <gene> matched mutations from GWAS data
+# Input data file has the following format: each row = unique sample id 
+# id,country,lineage,sublineage,drtype,drug,dr_muts_col,other_muts_col...
+# 0,sampleID,USA,lineage2,lineage2.2.1,Drug-resistant,0.0,WT,gene_match<wt>POS<mut>; pncA_c.<wt>POS<mut>...
+# where multiple mutations and multiple mutation types are separated by ';'. 
+# We are interested in the protein coding region i.e mutation with the<gene>_'p.' format.
+# This script splits the mutations on the ';' and extracts protein coding muts only
+# where each row is a separate mutation
+# sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique
+
+# NOTE
+#drtype is renamed to 'resistance' in the 35k dataset
+
+# output files: all lower case
+# 0) <gene>_common_ids.csv
+# 1) <gene>_ambiguous_muts.csv
+# 2) <gene>_mcsm_snps.csv
+# 3) <gene>_metadata.csv
+# 4) <gene>_all_muts_msa.csv
+# 5) <gene>_mutational_positons.csv
+
+# FIXME
+## Make all cols lowercase
+## change WildPos: wild_pos
+## Add an extra col: wild_chain_pos
+## output df: <gene>_linking_df.csv
+#containing the following cols
+#1. Mutationinformation
+#2. wild_type
+#3. position
+#4. mutant_type
+#5. chain
+#6. wild_pos
+#7. wild_chain_pos
+#=======================================================================
+#%% load libraries
+import os, sys
+import re
+import pandas as pd
+import numpy as np
+import argparse
+#=======================================================================
+#%% homdir and curr dir and local imports
+homedir = os.path.expanduser('~') 
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+os.getcwd()
+
+# import aa dict
+#from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
+#from tidy_split import tidy_split
+#=======================================================================
+#%% command line args
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None)
+arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
+
+args = arg_parser.parse_args()
+#=======================================================================
+#%% variable assignment: input and output paths & filenames
+drug = args.drug
+gene = args.gene
+
+#drug = 'pyrazinamide'
+#gene = 'pncA'
+
+gene_match = gene + '_p.'
+print('mut pattern for gene', gene, ':',  gene_match)
+
+nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
+print('nsSNP for gene', gene, ':',  nssnp_match)
+
+wt_regex = gene_match.lower()+'([A-Za-z]{3})'
+print('wt regex:', wt_regex)
+
+mut_regex = r'[0-9]+(\w{3})$'
+print('mt regex:', mut_regex)
+
+pos_regex = r'([0-9]+)'
+print('position regex:', pos_regex)
+
+# building cols to extract
+dr_muts_col = 'dr_mutations_' + drug
+other_muts_col = 'other_mutations_' + drug
+resistance_col = 'drtype'
+
+print('Extracting columns based on variables:\n'
+	, drug
+	, '\n'
+	, dr_muts_col
+	, '\n'
+	, other_muts_col
+    , '\n'
+    , resistance_col
+	, '\n===============================================================')
+#=======================================================================
+#%% input and output dirs and files
+#=======
+# dirs
+#=======
+datadir = homedir + '/' + 'git/Data'
+indir = datadir + '/' + drug + '/' + 'input'
+outdir = datadir + '/' + drug + '/' + 'output'
+
+#=======
+# input
+#=======
+#in_filename_master_master  = 'original_tanushree_data_v2.csv' #19k
+in_filename_master  = 'mtb_gwas_meta_v6.csv' #35k
+infile_master = datadir + '/' + in_filename_master
+print('Input file: ', infile_master
+      , '\n============================================================')
+
+#=======
+# output
+#=======
+out_filename_epistasis  = gene.lower() + '_epistasis_muts.csv'
+outfile_epistasis  = outdir + '/' + out_filename_epistasis 
+print('Output file: ', outfile_epistasis
+      , '\n============================================================')
+
+out_filename_epistasis_check  = gene.lower() + '_epistasis_muts_check.csv'
+outfile_epistasis_check  = outdir + '/' + out_filename_epistasis_check  
+print('Output file: ', outfile_epistasis_check
+      , '\n============================================================')
+
+
+#%%end of variable assignment for input and output files
+#=======================================================================
+#%% Read input file
+master_data  = pd.read_csv(infile_master, sep = ',')  
+
+# column names
+#list(master_data.columns)
+
+# extract elevant columns to extract from meta data related to the drug
+if in_filename_master == 'original_tanushree_data_v2.csv':
+    meta_data = master_data[['id'
+                             , 'country'
+                             , 'lineage'
+                             , 'sublineage'
+                             , 'drtype'
+                             , drug
+                             , dr_muts_col
+                             , other_muts_col]]
+else:
+    core_cols = ['id'
+                 , 'sample'
+
+                 , 'lineage'
+                 , 'sublineage' 
+  
+                 , 'country_code'
+                 , 'geographic_source'
+
+                 ,  resistance_col]
+    
+    variable_based_cols = [drug
+                           , dr_muts_col
+                           , other_muts_col]
+       
+    cols_to_extract = core_cols + variable_based_cols
+    print('Extracting', len(cols_to_extract), 'columns from master data')
+
+    meta_data = master_data[cols_to_extract]  
+    del(master_data, variable_based_cols, cols_to_extract)
+
+print('Extracted meta data from filename:', in_filename_master
+      , '\nDim:', meta_data.shape)
+
+# checks and results
+total_samples = meta_data['id'].nunique() 
+print('RESULT: Total samples:', total_samples
+		, '\n===========================================================')
+
+# counts NA per column
+meta_data.isna().sum()
+print('No. of NAs/column:' + '\n', meta_data.isna().sum()
+		, '\n===========================================================')
+#%%
+# shorter df
+cols_epi = ['id'
+            , 'sample'
+            , dr_muts_col
+            , other_muts_col]
+
+meta_data_epi = meta_data[cols_epi]
+
+# extract entries with semi colon
+multi_match = ';'
+meta_multi = meta_data_epi.loc[meta_data_epi[dr_muts_col].str.contains(multi_match , na = False, regex = True, case = False) | meta_data_epi[other_muts_col].str.contains(multi_match , na = False, regex = True, case = False) ]
+meta_gene_multi = meta_multi.loc[meta_multi[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) | meta_multi[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ]
+
+#%%
+# count no. of nssnp_match: dr_muts_col
+meta_gene_multi['dr_mult_snp_count'] = meta_gene_multi [dr_muts_col].str.count(nssnp_match, re.I)
+
+# count no. of nssnp_match: other_muts_col
+meta_gene_multi['other_mult_snp_count'] = meta_gene_multi [other_muts_col].str.count(nssnp_match, re.I)
+
+# check condition
+(meta_gene_multi['dr_mult_snp_count']>1) | (meta_gene_multi['other_mult_snp_count']>1) == True
+meta_gene_epi = meta_gene_multi.loc[(meta_gene_multi['dr_mult_snp_count']>1) | (meta_gene_multi['other_mult_snp_count']>1) == True]
+
+
+#%% TEST
+# formatting, replace !nssnp_match  with nothing
+foo1 = 	'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX'
+foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala'
+
+
+foo1_s = foo1.split(';')
+foo1_s
+nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
+arse=list(filter(nssnp_match2.match, foo1_s))
+arse
+
+foo1_s2 = ';'.join(arse)
+foo1_s2
+#%%
+nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
+
+# dr_muts_col
+dr_clean_col = dr_muts_col + '_clean'
+
+#meta_gene_epi[dr_clean_col] = meta_gene_epi[dr_muts_col].str.split(';')
+meta_gene_epi[dr_clean_col] = ''
+
+for i, v in enumerate(meta_gene_epi[dr_muts_col]):
+    #print(i, v)
+    print('======================================================')
+    print(i)
+    print(v)
+    dr2_s = v.split(';')
+    print(dr2_s)
+    dr2_sf = list(filter(nssnp_match2.match, dr2_s))
+    print(dr2_sf)
+    dr2_sf2  = ';'.join(dr2_sf)
+    meta_gene_epi[dr_clean_col].iloc[i] = dr2_sf2
+
+del(i, v)
+#%%
+# other_muts_col
+other_clean_col = other_muts_col + '_clean'
+
+#meta_gene_epi[other_clean_col] = meta_gene_epi[other_muts_col].str.split(';')
+meta_gene_epi[other_clean_col] = ''
+
+for i, v in enumerate(meta_gene_epi[other_muts_col]):
+    #print(i, v)
+    print('======================================================')
+    print(i)
+    print(v)
+    other2_s = v.split(';')
+    print(other2_s)
+    other2_sf = list(filter(nssnp_match2.match,  other2_s))
+    print(other2_sf)
+    other2_sf2  = ';'.join(other2_sf)
+    meta_gene_epi[other_clean_col].iloc[i] =  other2_sf2
+
+
+#%%
+# rearange columns
+
+meta_gene_epi_f = meta_gene_epi[['id', 'sample'
+                               , dr_muts_col, dr_clean_col
+                               , 'dr_mult_snp_count'
+                               , other_muts_col, other_clean_col
+                               , 'other_mult_snp_count']]
+meta_gene_epi_f.columns
+
+cols_to_output = ['id', 'sample'
+                   , dr_clean_col
+                  # , 'dr_mult_snp_count'
+                   , other_clean_col
+                  # , 'other_mult_snp_count'
+                  ]
+                  
+meta_gene_epi_f2 = meta_gene_epi_f[cols_to_output]
+
+
+
+#%%
+# formatting, replace !nssnp_match  with nothing
+#nssnp_neg_match = '(?!pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})'
+
+#%% end of data extraction. Write files
+meta_gene_epi_f.to_csv(outfile_epistasis_check, index = True)
+meta_gene_epi_f2.to_csv(outfile_epistasis, index = False)
+