From 8e16b2635eb9b72f5f6e0e968d34092d925c9966 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 29 Sep 2020 16:09:54 +0100 Subject: [PATCH] added ../data_extraction_epistasis.py for getting list for epistasis work --- scripts/data_extraction_epistasis.py | 304 +++++++++++++++++++++++++++ 1 file changed, 304 insertions(+) create mode 100755 scripts/data_extraction_epistasis.py diff --git a/scripts/data_extraction_epistasis.py b/scripts/data_extraction_epistasis.py new file mode 100755 index 0000000..6626338 --- /dev/null +++ b/scripts/data_extraction_epistasis.py @@ -0,0 +1,304 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +''' +Created on Tue Aug 6 12:56:03 2019 + +@author: tanu +''' + +# FIXME: include error checking to enure you only +# concentrate on positions that have structural info? + +# FIXME: import dirs.py to get the basic dir paths available +#======================================================================= +# TASK: extract ALL matched mutations from GWAS data +# Input data file has the following format: each row = unique sample id +# id,country,lineage,sublineage,drtype,drug,dr_muts_col,other_muts_col... +# 0,sampleID,USA,lineage2,lineage2.2.1,Drug-resistant,0.0,WT,gene_matchPOS; pncA_c.POS... +# where multiple mutations and multiple mutation types are separated by ';'. +# We are interested in the protein coding region i.e mutation with the_'p.' format. +# This script splits the mutations on the ';' and extracts protein coding muts only +# where each row is a separate mutation +# sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique + +# NOTE +#drtype is renamed to 'resistance' in the 35k dataset + +# output files: all lower case +# 0) _common_ids.csv +# 1) _ambiguous_muts.csv +# 2) _mcsm_snps.csv +# 3) _metadata.csv +# 4) _all_muts_msa.csv +# 5) _mutational_positons.csv + +# FIXME +## Make all cols lowercase +## change WildPos: wild_pos +## Add an extra col: wild_chain_pos +## output df: _linking_df.csv +#containing the following cols +#1. Mutationinformation +#2. wild_type +#3. position +#4. mutant_type +#5. chain +#6. wild_pos +#7. wild_chain_pos +#======================================================================= +#%% load libraries +import os, sys +import re +import pandas as pd +import numpy as np +import argparse +#======================================================================= +#%% homdir and curr dir and local imports +homedir = os.path.expanduser('~') +# set working dir +os.getcwd() +os.chdir(homedir + '/git/LSHTM_analysis/scripts') +os.getcwd() + +# import aa dict +#from reference_dict import my_aa_dict # CHECK DIR STRUC THERE! +#from tidy_split import tidy_split +#======================================================================= +#%% command line args +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None) +arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None) + +args = arg_parser.parse_args() +#======================================================================= +#%% variable assignment: input and output paths & filenames +drug = args.drug +gene = args.gene + +#drug = 'pyrazinamide' +#gene = 'pncA' + +gene_match = gene + '_p.' +print('mut pattern for gene', gene, ':', gene_match) + +nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}' +print('nsSNP for gene', gene, ':', nssnp_match) + +wt_regex = gene_match.lower()+'([A-Za-z]{3})' +print('wt regex:', wt_regex) + +mut_regex = r'[0-9]+(\w{3})$' +print('mt regex:', mut_regex) + +pos_regex = r'([0-9]+)' +print('position regex:', pos_regex) + +# building cols to extract +dr_muts_col = 'dr_mutations_' + drug +other_muts_col = 'other_mutations_' + drug +resistance_col = 'drtype' + +print('Extracting columns based on variables:\n' + , drug + , '\n' + , dr_muts_col + , '\n' + , other_muts_col + , '\n' + , resistance_col + , '\n===============================================================') +#======================================================================= +#%% input and output dirs and files +#======= +# dirs +#======= +datadir = homedir + '/' + 'git/Data' +indir = datadir + '/' + drug + '/' + 'input' +outdir = datadir + '/' + drug + '/' + 'output' + +#======= +# input +#======= +#in_filename_master_master = 'original_tanushree_data_v2.csv' #19k +in_filename_master = 'mtb_gwas_meta_v6.csv' #35k +infile_master = datadir + '/' + in_filename_master +print('Input file: ', infile_master + , '\n============================================================') + +#======= +# output +#======= +out_filename_epistasis = gene.lower() + '_epistasis_muts.csv' +outfile_epistasis = outdir + '/' + out_filename_epistasis +print('Output file: ', outfile_epistasis + , '\n============================================================') + +out_filename_epistasis_check = gene.lower() + '_epistasis_muts_check.csv' +outfile_epistasis_check = outdir + '/' + out_filename_epistasis_check +print('Output file: ', outfile_epistasis_check + , '\n============================================================') + + +#%%end of variable assignment for input and output files +#======================================================================= +#%% Read input file +master_data = pd.read_csv(infile_master, sep = ',') + +# column names +#list(master_data.columns) + +# extract elevant columns to extract from meta data related to the drug +if in_filename_master == 'original_tanushree_data_v2.csv': + meta_data = master_data[['id' + , 'country' + , 'lineage' + , 'sublineage' + , 'drtype' + , drug + , dr_muts_col + , other_muts_col]] +else: + core_cols = ['id' + , 'sample' + + , 'lineage' + , 'sublineage' + + , 'country_code' + , 'geographic_source' + + , resistance_col] + + variable_based_cols = [drug + , dr_muts_col + , other_muts_col] + + cols_to_extract = core_cols + variable_based_cols + print('Extracting', len(cols_to_extract), 'columns from master data') + + meta_data = master_data[cols_to_extract] + del(master_data, variable_based_cols, cols_to_extract) + +print('Extracted meta data from filename:', in_filename_master + , '\nDim:', meta_data.shape) + +# checks and results +total_samples = meta_data['id'].nunique() +print('RESULT: Total samples:', total_samples + , '\n===========================================================') + +# counts NA per column +meta_data.isna().sum() +print('No. of NAs/column:' + '\n', meta_data.isna().sum() + , '\n===========================================================') +#%% +# shorter df +cols_epi = ['id' + , 'sample' + , dr_muts_col + , other_muts_col] + +meta_data_epi = meta_data[cols_epi] + +# extract entries with semi colon +multi_match = ';' +meta_multi = meta_data_epi.loc[meta_data_epi[dr_muts_col].str.contains(multi_match , na = False, regex = True, case = False) | meta_data_epi[other_muts_col].str.contains(multi_match , na = False, regex = True, case = False) ] +meta_gene_multi = meta_multi.loc[meta_multi[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) | meta_multi[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ] + +#%% +# count no. of nssnp_match: dr_muts_col +meta_gene_multi['dr_mult_snp_count'] = meta_gene_multi [dr_muts_col].str.count(nssnp_match, re.I) + +# count no. of nssnp_match: other_muts_col +meta_gene_multi['other_mult_snp_count'] = meta_gene_multi [other_muts_col].str.count(nssnp_match, re.I) + +# check condition +(meta_gene_multi['dr_mult_snp_count']>1) | (meta_gene_multi['other_mult_snp_count']>1) == True +meta_gene_epi = meta_gene_multi.loc[(meta_gene_multi['dr_mult_snp_count']>1) | (meta_gene_multi['other_mult_snp_count']>1) == True] + + +#%% TEST +# formatting, replace !nssnp_match with nothing +foo1 = 'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX' +foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala' + + +foo1_s = foo1.split(';') +foo1_s +nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})') +arse=list(filter(nssnp_match2.match, foo1_s)) +arse + +foo1_s2 = ';'.join(arse) +foo1_s2 +#%% +nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})') + +# dr_muts_col +dr_clean_col = dr_muts_col + '_clean' + +#meta_gene_epi[dr_clean_col] = meta_gene_epi[dr_muts_col].str.split(';') +meta_gene_epi[dr_clean_col] = '' + +for i, v in enumerate(meta_gene_epi[dr_muts_col]): + #print(i, v) + print('======================================================') + print(i) + print(v) + dr2_s = v.split(';') + print(dr2_s) + dr2_sf = list(filter(nssnp_match2.match, dr2_s)) + print(dr2_sf) + dr2_sf2 = ';'.join(dr2_sf) + meta_gene_epi[dr_clean_col].iloc[i] = dr2_sf2 + +del(i, v) +#%% +# other_muts_col +other_clean_col = other_muts_col + '_clean' + +#meta_gene_epi[other_clean_col] = meta_gene_epi[other_muts_col].str.split(';') +meta_gene_epi[other_clean_col] = '' + +for i, v in enumerate(meta_gene_epi[other_muts_col]): + #print(i, v) + print('======================================================') + print(i) + print(v) + other2_s = v.split(';') + print(other2_s) + other2_sf = list(filter(nssnp_match2.match, other2_s)) + print(other2_sf) + other2_sf2 = ';'.join(other2_sf) + meta_gene_epi[other_clean_col].iloc[i] = other2_sf2 + + +#%% +# rearange columns + +meta_gene_epi_f = meta_gene_epi[['id', 'sample' + , dr_muts_col, dr_clean_col + , 'dr_mult_snp_count' + , other_muts_col, other_clean_col + , 'other_mult_snp_count']] +meta_gene_epi_f.columns + +cols_to_output = ['id', 'sample' + , dr_clean_col + # , 'dr_mult_snp_count' + , other_clean_col + # , 'other_mult_snp_count' + ] + +meta_gene_epi_f2 = meta_gene_epi_f[cols_to_output] + + + +#%% +# formatting, replace !nssnp_match with nothing +#nssnp_neg_match = '(?!pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})' + +#%% end of data extraction. Write files +meta_gene_epi_f.to_csv(outfile_epistasis_check, index = True) +meta_gene_epi_f2.to_csv(outfile_epistasis, index = False) +