152 lines
5.4 KiB
Python
Executable file
152 lines
5.4 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Created on Wed Jun 10 11:13:49 2020
|
|
|
|
@author: tanu
|
|
"""
|
|
#==============================================================================
|
|
# TASK
|
|
# To format snp_fino.txt file that has already been processed in bash
|
|
# to include mcsm style muts and gwas style muts. The idea is that the info file
|
|
# will contain all possible mutation format style to make it easy to populate
|
|
# and link other files with this sort of meta data. For example: or file
|
|
#=======================================================================
|
|
|
|
# FIXME : add bash info here as well
|
|
|
|
#%% useful links
|
|
#https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/
|
|
#https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/
|
|
#https://stackoverflow.com/questions/40348541/pandas-diff-with-string
|
|
#=======================================================================
|
|
#%% specify dirs
|
|
import os, sys
|
|
import pandas as pd
|
|
import numpy as np
|
|
import re
|
|
import argparse
|
|
|
|
homedir = os.path.expanduser('~')
|
|
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
|
|
|
#from reference_dict import my_aa_dict
|
|
#from reference_dict import low_3letter_dict # equivalent of my_aa_dict
|
|
from reference_dict import oneletter_aa_dict
|
|
#=======================================================================
|
|
#%% command line args
|
|
arg_parser = argparse.ArgumentParser()
|
|
|
|
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
|
|
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = 'pncA') # case sensitive
|
|
|
|
args = arg_parser.parse_args()
|
|
#=======================================================================
|
|
#%% variables
|
|
#gene = 'pncA'
|
|
#drug = 'pyrazinamide'
|
|
#gene_match = gene +'_p.'
|
|
|
|
# cmd variables
|
|
gene = args.gene
|
|
drug = args.drug
|
|
gene_match = gene +'_p.'
|
|
|
|
#=======================================================================
|
|
#%% input and output dirs and files
|
|
#=======
|
|
# data dir
|
|
#=======
|
|
datadir = homedir + '/' + 'git/Data'
|
|
indir = datadir + '/' + drug + '/input'
|
|
outdir = datadir + '/' + drug + '/output'
|
|
|
|
#=======
|
|
# input
|
|
#=======
|
|
|
|
gene_info_filename = 'ns'+ gene.lower()+ '_snp_info1.txt'
|
|
gene_info = indir + '/' + gene_info_filename
|
|
print('gene info file: ', gene_info
|
|
, '\n============================================================')
|
|
|
|
#=======
|
|
# output
|
|
#=======
|
|
gene_snp_info_filename = 'ns' + gene.lower() + '_snp_info.csv' # other one is called AFandOR
|
|
outfile_snp_info = indir + '/' + gene_snp_info_filename
|
|
print('Output file: ', outfile_snp_info
|
|
, '\n============================================================')
|
|
|
|
#%% read files: preformatted using bash
|
|
info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #303, 10
|
|
|
|
#%% Split into three cols with 1-letter aa_code & combine to get mutationinformation column
|
|
# check mutation format in exisiting df
|
|
info_df2.head()
|
|
info_df2['mut_info'].head()
|
|
|
|
print('Creating column: mutationinformation')
|
|
info_df2_ncols = len(info_df2.columns)
|
|
|
|
info_df2['wild_type'] = info_df2['mut_info'].str.extract('(\w{1})>')
|
|
info_df2['position'] = info_df2['mut_info'].str.extract('(\d+)')
|
|
info_df2['mutant_type'] = info_df2['mut_info'].str.extract('>\d+(\w{1})')
|
|
|
|
info_df2['mutationinformation'] = info_df2['wild_type'] + info_df2['position'] + info_df2['mutant_type']
|
|
|
|
# sanity check
|
|
ncols_add = 4 # Beware hardcoded
|
|
if len(info_df2.columns) == info_df2_ncols + ncols_add:
|
|
print('PASS: Succesfully extracted and added mutationinformation (mcsm style) as below\n'
|
|
, info_df2['mutationinformation'].head()
|
|
, '\n=====================================================================')
|
|
else:
|
|
print('FAIL: No. of cols mismatch'
|
|
,'\noriginal length:', info_df2_ncols
|
|
, '\nExpected no. of cols:', info_df2_ncols + ncols_add
|
|
, '\nGot no. of cols:', len(info_df2.columns))
|
|
sys.exit()
|
|
|
|
# update ncols
|
|
info_df2_ncols = len(info_df2.columns)
|
|
|
|
#%% Creating column 'mutation' which as mutation of the format;
|
|
# <gene_match>.lower()<abc>1<cde>: pnca_p.trp68gly
|
|
# match the 'one_letter_code' value to get the dict key (three-letter code)
|
|
|
|
print('Creating column: mutation')
|
|
|
|
# dict to use: oneletter_aa_dict
|
|
lookup_dict = dict()
|
|
for k1, v1 in oneletter_aa_dict.items():
|
|
lookup_dict[k1] = v1['three_letter_code_lower']
|
|
for k2, v2 in lookup_dict.items():
|
|
info_df2['wt_3let'] = info_df2['wild_type'].squeeze().map(lookup_dict)
|
|
info_df2['mt_3let'] = info_df2['mutant_type'].squeeze().map(lookup_dict)
|
|
|
|
# create column mutation
|
|
info_df2['mutation'] = info_df2['wt_3let'] + info_df2['position'] + info_df2['mt_3let']
|
|
|
|
# add prefix: gene_match to each value in column
|
|
info_df2['mutation'] = gene_match.lower() + info_df2['mutation'].astype(str)
|
|
|
|
# sanity check
|
|
ncols_add = 3 # Beware hardcoded
|
|
if len(info_df2.columns) == info_df2_ncols + ncols_add:
|
|
print('PASS: Succesfully created column mutation as below\n'
|
|
, info_df2['mutation'].head()
|
|
, '\n=====================================================================')
|
|
else:
|
|
print('FAIL: No. of cols mismatch\noriginal length:', info_df2_ncols
|
|
, '\nExpected no. of cols:', info_df2_ncols + ncols_add
|
|
, '\nGot no. of cols:', len(info_df2.columns))
|
|
sys.exit()
|
|
|
|
#%% write file
|
|
print('\n====================================================================='
|
|
, '\nWriting output file:\n', outfile_snp_info
|
|
, '\nNo.of rows:', len(info_df2)
|
|
, '\nNo. of cols:', len(info_df2.columns))
|
|
info_df2.to_csv(outfile_snp_info, index = False)
|
|
|