added commonly used mutation format for missense muts in the gene_specific nssnp_info file
This commit is contained in:
parent
a298071309
commit
626ed3a57b
4 changed files with 194 additions and 568 deletions
152
scripts/nssnp_info_format.py
Executable file
152
scripts/nssnp_info_format.py
Executable file
|
@ -0,0 +1,152 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Jun 10 11:13:49 2020
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#==============================================================================
|
||||
# TASK
|
||||
# To format snp_fino.txt file that has already been processed in bash
|
||||
# to include mcsm style muts and gwas style muts. The idea is that the info file
|
||||
# will contain all possible mutation format style to make it easy to populate
|
||||
# and link other files with this sort of meta data. For example: or file
|
||||
#=======================================================================
|
||||
|
||||
# FIXME : add bash info here as well
|
||||
|
||||
#%% useful links
|
||||
#https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/
|
||||
#https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/
|
||||
#https://stackoverflow.com/questions/40348541/pandas-diff-with-string
|
||||
#=======================================================================
|
||||
#%% specify dirs
|
||||
import os, sys
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import re
|
||||
import argparse
|
||||
|
||||
homedir = os.path.expanduser('~')
|
||||
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
||||
|
||||
#from reference_dict import my_aa_dict
|
||||
#from reference_dict import low_3letter_dict # equivalent of my_aa_dict
|
||||
from reference_dict import oneletter_aa_dict
|
||||
#=======================================================================
|
||||
#%% command line args
|
||||
arg_parser = argparse.ArgumentParser()
|
||||
|
||||
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
|
||||
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = 'pncA') # case sensitive
|
||||
|
||||
args = arg_parser.parse_args()
|
||||
#=======================================================================
|
||||
#%% variables
|
||||
#gene = 'pncA'
|
||||
#drug = 'pyrazinamide'
|
||||
#gene_match = gene +'_p.'
|
||||
|
||||
# cmd variables
|
||||
gene = args.gene
|
||||
drug = args.drug
|
||||
gene_match = gene +'_p.'
|
||||
|
||||
#=======================================================================
|
||||
#%% input and output dirs and files
|
||||
#=======
|
||||
# data dir
|
||||
#=======
|
||||
datadir = homedir + '/' + 'git/Data'
|
||||
indir = datadir + '/' + drug + '/input'
|
||||
outdir = datadir + '/' + drug + '/output'
|
||||
|
||||
#=======
|
||||
# input
|
||||
#=======
|
||||
|
||||
gene_info_filename = 'ns'+ gene.lower()+ '_snp_info1.txt'
|
||||
gene_info = indir + '/' + gene_info_filename
|
||||
print('gene info file: ', gene_info
|
||||
, '\n============================================================')
|
||||
|
||||
#=======
|
||||
# output
|
||||
#=======
|
||||
gene_snp_info_filename = 'ns' + gene.lower() + '_snp_info.csv' # other one is called AFandOR
|
||||
outfile_snp_info = indir + '/' + gene_snp_info_filename
|
||||
print('Output file: ', outfile_snp_info
|
||||
, '\n============================================================')
|
||||
|
||||
#%% read files: preformatted using bash
|
||||
info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #303, 10
|
||||
|
||||
#%% Split into three cols with 1-letter aa_code & combine to get mutationinformation column
|
||||
# check mutation format in exisiting df
|
||||
info_df2.head()
|
||||
info_df2['mut_info'].head()
|
||||
|
||||
print('Creating column: mutationinformation')
|
||||
info_df2_ncols = len(info_df2.columns)
|
||||
|
||||
info_df2['wild_type'] = info_df2['mut_info'].str.extract('(\w{1})>')
|
||||
info_df2['position'] = info_df2['mut_info'].str.extract('(\d+)')
|
||||
info_df2['mutant_type'] = info_df2['mut_info'].str.extract('>\d+(\w{1})')
|
||||
|
||||
info_df2['mutationinformation'] = info_df2['wild_type'] + info_df2['position'] + info_df2['mutant_type']
|
||||
|
||||
# sanity check
|
||||
ncols_add = 4 # Beware hardcoded
|
||||
if len(info_df2.columns) == info_df2_ncols + ncols_add:
|
||||
print('PASS: Succesfully extracted and added mutationinformation (mcsm style) as below\n'
|
||||
, info_df2['mutationinformation'].head()
|
||||
, '\n=====================================================================')
|
||||
else:
|
||||
print('FAIL: No. of cols mismatch'
|
||||
,'\noriginal length:', info_df2_ncols
|
||||
, '\nExpected no. of cols:', info_df2_ncols + ncols_add
|
||||
, '\nGot no. of cols:', len(info_df2.columns))
|
||||
sys.exit()
|
||||
|
||||
# update ncols
|
||||
info_df2_ncols = len(info_df2.columns)
|
||||
|
||||
#%% Creating column 'mutation' which as mutation of the format;
|
||||
# <gene_match>.lower()<abc>1<cde>: pnca_p.trp68gly
|
||||
# match the 'one_letter_code' value to get the dict key (three-letter code)
|
||||
|
||||
print('Creating column: mutation')
|
||||
|
||||
# dict to use: oneletter_aa_dict
|
||||
lookup_dict = dict()
|
||||
for k1, v1 in oneletter_aa_dict.items():
|
||||
lookup_dict[k1] = v1['three_letter_code_lower']
|
||||
for k2, v2 in lookup_dict.items():
|
||||
info_df2['wt_3let'] = info_df2['wild_type'].squeeze().map(lookup_dict)
|
||||
info_df2['mt_3let'] = info_df2['mutant_type'].squeeze().map(lookup_dict)
|
||||
|
||||
# create column mutation
|
||||
info_df2['mutation'] = info_df2['wt_3let'] + info_df2['position'] + info_df2['mt_3let']
|
||||
|
||||
# add prefix: gene_match to each value in column
|
||||
info_df2['mutation'] = gene_match.lower() + info_df2['mutation'].astype(str)
|
||||
|
||||
# sanity check
|
||||
ncols_add = 3 # Beware hardcoded
|
||||
if len(info_df2.columns) == info_df2_ncols + ncols_add:
|
||||
print('PASS: Succesfully created column mutation as below\n'
|
||||
, info_df2['mutation'].head()
|
||||
, '\n=====================================================================')
|
||||
else:
|
||||
print('FAIL: No. of cols mismatch\noriginal length:', info_df2_ncols
|
||||
, '\nExpected no. of cols:', info_df2_ncols + ncols_add
|
||||
, '\nGot no. of cols:', len(info_df2.columns))
|
||||
sys.exit()
|
||||
|
||||
#%% write file
|
||||
print('\n====================================================================='
|
||||
, '\nWriting output file:\n', outfile_snp_info
|
||||
, '\nNo.of rows:', len(info_df2)
|
||||
, '\nNo. of cols:', len(info_df2.columns))
|
||||
info_df2.to_csv(outfile_snp_info, index = False)
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue