#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Wed Jun 10 11:13:49 2020 @author: tanu """ #============================================================================== # TASK # To format snp_fino.txt file that has already been processed in bash # to include mcsm style muts and gwas style muts. The idea is that the info file # will contain all possible mutation format style to make it easy to populate # and link other files with this sort of meta data. For example: or file #======================================================================= # FIXME : add bash info here as well #%% useful links #https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/ #https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/ #https://stackoverflow.com/questions/40348541/pandas-diff-with-string #======================================================================= #%% specify dirs import os, sys import pandas as pd import numpy as np import re import argparse homedir = os.path.expanduser('~') os.chdir(homedir + '/git/LSHTM_analysis/scripts') #from reference_dict import my_aa_dict #from reference_dict import low_3letter_dict # equivalent of my_aa_dict from reference_dict import oneletter_aa_dict #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide') arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = 'pncA') # case sensitive args = arg_parser.parse_args() #======================================================================= #%% variables #gene = 'pncA' #drug = 'pyrazinamide' #gene_match = gene +'_p.' # cmd variables gene = args.gene drug = args.drug gene_match = gene +'_p.' #======================================================================= #%% input and output dirs and files #======= # data dir #======= datadir = homedir + '/' + 'git/Data' indir = datadir + '/' + drug + '/input' outdir = datadir + '/' + drug + '/output' #======= # input #======= gene_info_filename = 'ns'+ gene.lower()+ '_snp_info1.txt' gene_info = indir + '/' + gene_info_filename print('gene info file: ', gene_info , '\n============================================================') #======= # output #======= gene_snp_info_filename = 'ns' + gene.lower() + '_snp_info.csv' # other one is called AFandOR outfile_snp_info = indir + '/' + gene_snp_info_filename print('Output file: ', outfile_snp_info , '\n============================================================') #%% read files: preformatted using bash info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #303, 10 #%% Split into three cols with 1-letter aa_code & combine to get mutationinformation column # check mutation format in exisiting df info_df2.head() info_df2['mut_info'].head() print('Creating column: mutationinformation') info_df2_ncols = len(info_df2.columns) info_df2['wild_type'] = info_df2['mut_info'].str.extract('(\w{1})>') info_df2['position'] = info_df2['mut_info'].str.extract('(\d+)') info_df2['mutant_type'] = info_df2['mut_info'].str.extract('>\d+(\w{1})') info_df2['mutationinformation'] = info_df2['wild_type'] + info_df2['position'] + info_df2['mutant_type'] # sanity check ncols_add = 4 # Beware hardcoded if len(info_df2.columns) == info_df2_ncols + ncols_add: print('PASS: Succesfully extracted and added mutationinformation (mcsm style) as below\n' , info_df2['mutationinformation'].head() , '\n=====================================================================') else: print('FAIL: No. of cols mismatch' ,'\noriginal length:', info_df2_ncols , '\nExpected no. of cols:', info_df2_ncols + ncols_add , '\nGot no. of cols:', len(info_df2.columns)) sys.exit() # update ncols info_df2_ncols = len(info_df2.columns) #%% Creating column 'mutation' which as mutation of the format; # .lower()1: pnca_p.trp68gly # match the 'one_letter_code' value to get the dict key (three-letter code) print('Creating column: mutation') # dict to use: oneletter_aa_dict lookup_dict = dict() for k1, v1 in oneletter_aa_dict.items(): lookup_dict[k1] = v1['three_letter_code_lower'] for k2, v2 in lookup_dict.items(): info_df2['wt_3let'] = info_df2['wild_type'].squeeze().map(lookup_dict) info_df2['mt_3let'] = info_df2['mutant_type'].squeeze().map(lookup_dict) # create column mutation info_df2['mutation'] = info_df2['wt_3let'] + info_df2['position'] + info_df2['mt_3let'] # add prefix: gene_match to each value in column info_df2['mutation'] = gene_match.lower() + info_df2['mutation'].astype(str) # sanity check ncols_add = 3 # Beware hardcoded if len(info_df2.columns) == info_df2_ncols + ncols_add: print('PASS: Succesfully created column mutation as below\n' , info_df2['mutation'].head() , '\n=====================================================================') else: print('FAIL: No. of cols mismatch\noriginal length:', info_df2_ncols , '\nExpected no. of cols:', info_df2_ncols + ncols_add , '\nGot no. of cols:', len(info_df2.columns)) sys.exit() #%% write file print('\n=====================================================================' , '\nWriting output file:\n', outfile_snp_info , '\nNo.of rows:', len(info_df2) , '\nNo. of cols:', len(info_df2.columns)) info_df2.to_csv(outfile_snp_info, index = False)