LSHTM_analysis/scripts/nssnp_info_format.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 10 11:13:49 2020

@author: tanu
"""
#==============================================================================
# TASK
# To format snp_fino.txt file that has already been processed in bash
# to include mcsm style muts and gwas style muts. The idea is that the info file
# will contain all possible mutation format style to make it easy to populate
# and link other files with this sort of meta data. For example: or file
#=======================================================================

# FIXME : add bash info here as well

#%% useful links
#https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/
#https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/
#https://stackoverflow.com/questions/40348541/pandas-diff-with-string
#=======================================================================
#%% specify dirs
import os, sys
import pandas as pd
import numpy as np
import re
import argparse

homedir = os.path.expanduser('~')
os.chdir(homedir + '/git/LSHTM_analysis/scripts')

#from reference_dict import my_aa_dict
#from reference_dict import low_3letter_dict # equivalent of my_aa_dict
from reference_dict import oneletter_aa_dict
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()

arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = 'pncA') # case sensitive

args = arg_parser.parse_args()
#=======================================================================
#%% variables
#gene = 'pncA'
#drug = 'pyrazinamide'
#gene_match = gene +'_p.'

# cmd variables
gene = args.gene
drug = args.drug
gene_match = gene +'_p.'

#=======================================================================
#%% input and output dirs and files
#=======
# data dir
#=======
datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/input'
outdir = datadir + '/' + drug + '/output'

#=======
# input
#=======

gene_info_filename  = 'ns'+ gene.lower()+ '_snp_info1.txt'
gene_info = indir + '/' + gene_info_filename
print('gene info file: ', gene_info
      , '\n============================================================')

#=======
# output
#=======
gene_snp_info_filename = 'ns' + gene.lower() + '_snp_info.csv' # other one is called AFandOR
outfile_snp_info =  indir + '/' + gene_snp_info_filename
print('Output file: ', outfile_snp_info
      , '\n============================================================')

#%% read files: preformatted using bash
info_df2  = pd.read_csv(gene_info, sep = '\t', header = 0) #303, 10

#%% Split into three cols with 1-letter aa_code & combine to get mutationinformation column
# check mutation format in exisiting df
info_df2.head()
info_df2['mut_info'].head()

print('Creating column: mutationinformation')
info_df2_ncols = len(info_df2.columns)

info_df2['wild_type'] = info_df2['mut_info'].str.extract('(\w{1})>')
info_df2['position'] = info_df2['mut_info'].str.extract('(\d+)')
info_df2['mutant_type'] = info_df2['mut_info'].str.extract('>\d+(\w{1})')

info_df2['mutationinformation'] = info_df2['wild_type'] + info_df2['position'] + info_df2['mutant_type']

# sanity check
ncols_add = 4 # Beware hardcoded
if len(info_df2.columns) == info_df2_ncols + ncols_add:
    print('PASS: Succesfully extracted and added mutationinformation (mcsm style) as below\n'
          , info_df2['mutationinformation'].head()
          , '\n=====================================================================')
else:
    print('FAIL: No. of cols mismatch'
          ,'\noriginal length:', info_df2_ncols
          , '\nExpected no. of cols:', info_df2_ncols + ncols_add
          , '\nGot no. of cols:', len(info_df2.columns))
    sys.exit()

# update ncols
info_df2_ncols = len(info_df2.columns)

#%%  Creating column 'mutation' which as mutation of the format;
# <gene_match>.lower()<abc>1<cde>: pnca_p.trp68gly
# match the 'one_letter_code' value to get the dict key (three-letter code)

print('Creating column: mutation')

# dict to use: oneletter_aa_dict
lookup_dict = dict()
for k1, v1 in oneletter_aa_dict.items():
    lookup_dict[k1] = v1['three_letter_code_lower']
for k2, v2 in lookup_dict.items():
    info_df2['wt_3let'] = info_df2['wild_type'].squeeze().map(lookup_dict)
    info_df2['mt_3let'] = info_df2['mutant_type'].squeeze().map(lookup_dict)

# create column mutation
info_df2['mutation'] = info_df2['wt_3let'] + info_df2['position'] + info_df2['mt_3let']

# add prefix: gene_match to each value in column
info_df2['mutation'] = gene_match.lower() + info_df2['mutation'].astype(str)

# sanity check
ncols_add = 3 # Beware hardcoded
if len(info_df2.columns) == info_df2_ncols + ncols_add:
    print('PASS: Succesfully created column mutation as below\n'
          , info_df2['mutation'].head()
          , '\n=====================================================================')
else:
    print('FAIL: No. of cols mismatch\noriginal length:', info_df2_ncols
          , '\nExpected no. of cols:', info_df2_ncols + ncols_add
          , '\nGot no. of cols:', len(info_df2.columns))
    sys.exit()

#%% write file
print('\n====================================================================='
      , '\nWriting output file:\n', outfile_snp_info
      , '\nNo.of rows:', len(info_df2)
      , '\nNo. of cols:', len(info_df2.columns))
info_df2.to_csv(outfile_snp_info, index = False)