#!/usr/bin/env python3 # -*- coding: utf-8 -*- ''' Created on Tue Aug 6 12:56:03 2019 @author: tanu ''' # FIXME: include error checking to enure you only # concentrate on positions that have structural info? # FIXME: import dirs.py to get the basic dir paths available #======================================================================= # TASK: extract ALL matched mutations from GWAS data # Input data file has the following format: each row = unique sample id # id,country,lineage,sublineage,drtype,drug,dr_muts_col,other_muts_col... # 0,sampleID,USA,lineage2,lineage2.2.1,Drug-resistant,0.0,WT,gene_matchPOS; pncA_c.POS... # where multiple mutations and multiple mutation types are separated by ';'. # We are interested in the protein coding region i.e mutation with the_'p.' format. # This script splits the mutations on the ';' and extracts protein coding muts only # where each row is a separate mutation # sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique # NOTE #drtype is renamed to 'resistance' in the 35k dataset # output files: all lower case # 0) _common_ids.csv # 1) _ambiguous_muts.csv # 2) _mcsm_snps.csv # 3) _metadata.csv # 4) _all_muts_msa.csv # 5) _mutational_positons.csv # FIXME ## Make all cols lowercase ## change WildPos: wild_pos ## Add an extra col: wild_chain_pos ## output df: _linking_df.csv #containing the following cols #1. Mutationinformation #2. wild_type #3. position #4. mutant_type #5. chain #6. wild_pos #7. wild_chain_pos #======================================================================= #%% load libraries import os, sys import re import pandas as pd import numpy as np import argparse #======================================================================= #%% homdir and curr dir and local imports homedir = os.path.expanduser('~') # set working dir os.getcwd() os.chdir(homedir + '/git/LSHTM_analysis/scripts') os.getcwd() # import aa dict #from reference_dict import my_aa_dict # CHECK DIR STRUC THERE! #from tidy_split import tidy_split #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None) arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None) args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output paths & filenames drug = args.drug gene = args.gene gene_match = gene + '_p.' print('mut pattern for gene', gene, ':', gene_match) nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}' print('nsSNP for gene', gene, ':', nssnp_match) nssnp_match2 = re.compile(nssnp_match) wt_regex = gene_match.lower()+'([A-Za-z]{3})' print('wt regex:', wt_regex) mut_regex = r'[0-9]+(\w{3})$' print('mt regex:', mut_regex) pos_regex = r'([0-9]+)' print('position regex:', pos_regex) # building cols to extract dr_muts_col = 'dr_mutations_' + drug other_muts_col = 'other_mutations_' + drug resistance_col = 'drtype' print('Extracting columns based on variables:\n' , drug , '\n' , dr_muts_col , '\n' , other_muts_col , '\n' , resistance_col , '\n===============================================================') #======================================================================= #%% input and output dirs and files #======= # dirs #======= datadir = homedir + '/' + 'git/Data' indir = datadir + '/' + drug + '/' + 'input' outdir = datadir + '/' + drug + '/' + 'output' #======= # input #======= #in_filename_master_master = 'original_tanushree_data_v2.csv' #19k in_filename_master = 'mtb_gwas_meta_v6.csv' #35k infile_master = datadir + '/' + in_filename_master print('Input file: ', infile_master , '\n============================================================') #======= # output #======= out_filename_epistasis = gene.lower() + '_epistasis_muts.csv' outfile_epistasis = outdir + '/' + out_filename_epistasis print('Output file: ', outfile_epistasis , '\n============================================================') out_filename_epistasis_check = gene.lower() + '_epistasis_muts_check.csv' outfile_epistasis_check = outdir + '/' + out_filename_epistasis_check print('Output file: ', outfile_epistasis_check , '\n============================================================') #%%end of variable assignment for input and output files #======================================================================= #%% Read input file master_data = pd.read_csv(infile_master, sep = ',') # column names #list(master_data.columns) # extract elevant columns to extract from meta data related to the drug if in_filename_master == 'original_tanushree_data_v2.csv': meta_data = master_data[['id' , 'country' , 'lineage' , 'sublineage' , 'drtype' , drug , dr_muts_col , other_muts_col]] else: core_cols = ['id' , 'sample' , 'lineage' , 'sublineage' , 'country_code' , 'geographic_source' , resistance_col] variable_based_cols = [drug , dr_muts_col , other_muts_col] cols_to_extract = core_cols + variable_based_cols print('Extracting', len(cols_to_extract), 'columns from master data') meta_data = master_data[cols_to_extract] del(master_data, variable_based_cols, cols_to_extract) print('Extracted meta data from filename:', in_filename_master , '\nDim:', meta_data.shape) # checks and results total_samples = meta_data['id'].nunique() print('RESULT: Total samples:', total_samples , '\n===========================================================') # counts NA per column meta_data.isna().sum() print('No. of NAs/column:' + '\n', meta_data.isna().sum() , '\n===========================================================') #%% # shorter df cols_epi = ['id' , 'sample' , dr_muts_col , other_muts_col] meta_data_epi = meta_data[cols_epi] # extract entries with semi colon multi_match = ';' meta_multi = meta_data_epi.loc[meta_data_epi[dr_muts_col].str.contains(multi_match , na = False, regex = True, case = False) | meta_data_epi[other_muts_col].str.contains(multi_match , na = False, regex = True, case = False) ] meta_gene_multi = meta_multi.loc[meta_multi[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) | meta_multi[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ] #%% # count no. of nssnp_match: dr_muts_col meta_gene_multi['dr_mult_snp_count'] = meta_gene_multi [dr_muts_col].str.count(nssnp_match, re.I) # count no. of nssnp_match: other_muts_col meta_gene_multi['other_mult_snp_count'] = meta_gene_multi [other_muts_col].str.count(nssnp_match, re.I) # check condition (meta_gene_multi['dr_mult_snp_count']>1) | (meta_gene_multi['other_mult_snp_count']>1) == True meta_gene_epi = meta_gene_multi.loc[(meta_gene_multi['dr_mult_snp_count']>1) | (meta_gene_multi['other_mult_snp_count']>1) == True] #%% TEST # formatting, replace !nssnp_match with nothing #foo1 = 'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX' #foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala' #foo1_s = foo1.split(';') #foo1_s #nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})') #arse=list(filter(nssnp_match2.match, foo1_s)) #arse #foo1_s2 = ';'.join(arse) #foo1_s2 #%% #nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})') # dr_muts_col dr_clean_col = dr_muts_col + '_clean' #meta_gene_epi[dr_clean_col] = meta_gene_epi[dr_muts_col].str.split(';') meta_gene_epi[dr_clean_col] = '' for i, v in enumerate(meta_gene_epi[dr_muts_col]): #print(i, v) print('======================================================') print(i) print(v) dr2_s = v.split(';') print(dr2_s) dr2_sf = list(filter(nssnp_match2.match, dr2_s)) #dr2_sf = list(filter(nssnp_match.match, dr2_s)) print(dr2_sf) dr2_sf2 = ';'.join(dr2_sf) meta_gene_epi[dr_clean_col].iloc[i] = dr2_sf2 del(i, v) #%% # other_muts_col other_clean_col = other_muts_col + '_clean' #meta_gene_epi[other_clean_col] = meta_gene_epi[other_muts_col].str.split(';') meta_gene_epi[other_clean_col] = '' for i, v in enumerate(meta_gene_epi[other_muts_col]): #print(i, v) #print('======================================================') #print(i) #print(v) other2_s = v.split(';') #print(other2_s) other2_sf = list(filter(nssnp_match2.match, other2_s)) #print(other2_sf) other2_sf2 = ';'.join(other2_sf) meta_gene_epi[other_clean_col].iloc[i] = other2_sf2 #%% # rearange columns meta_gene_epi_f = meta_gene_epi[['id', 'sample' , dr_muts_col, dr_clean_col , 'dr_mult_snp_count' , other_muts_col, other_clean_col , 'other_mult_snp_count']] #print(meta_gene_epi_f.columns) print(meta_gene_epi_f) cols_to_output = ['id', 'sample' , dr_clean_col # , 'dr_mult_snp_count' , other_clean_col # , 'other_mult_snp_count' ] meta_gene_epi_f2 = meta_gene_epi_f[cols_to_output] #%% # formatting, replace !nssnp_match with nothing #nssnp_neg_match = '(?!pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})' #%% end of data extraction. Write files meta_gene_epi_f.to_csv(outfile_epistasis_check, index = True) meta_gene_epi_f2.to_csv(outfile_epistasis, index = False)