#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Mon June 14 2021

@author: tanu

Functions to add amino acid (aa) code columns for mcsm style and
gwas style mutations to a given pandas df.
'''
# FIXME: import dirs.py to get the basic dir paths available
#=======================================================================
# TASK: add aa code columns for mcsm and gwas style mutations to a df

# Input: pandas df containing either one letter wt/mutant aa columns
#        (mcsm style) or a gwas style mutation column

# Output: the same df with the aa code columns added
#=======================================================================
#%% load libraries
import os
import sys
import pandas as pd
import re
#import numpy as np
import argparse
DEBUG = False
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
# NOTE(review): presumably needed so that reference_dict (imported below)
# is found in the scripts dir -- confirm before removing the chdir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()

from reference_dict import oneletter_aa_dict
from reference_dict import low_3letter_dict
#=======================================================================
#%%###########################################################################
# FUNCTION: using mcsm mutation format to add
# 2 separate columns for wt 3 letter lowercase and mut 3 letter lowercase
###############################################################################

def get_aa_3lower(df, wt_colname = 'wild_type', mut_colname = 'mutant_type', col_wt = 'wt_aa_3lower', col_mut = 'mut_aa_3lower'):

    """ Add 3 letter lowercase aa code for wt and mutant residues specified as 1 letter uppercase aa code

    @df: df containing one letter aa code for wt and mutant respectively
    @type: pandas df

    @wt_colname: column containing one letter wild type aa
    @type: str

    @mut_colname: column containing one letter mutant type aa
    @type: str

    @col_wt: name of the added column with 3 letter aa code lower for wild type aa
    @type: str

    @col_mut: name of the added column with 3 letter aa code lower for mutant type aa
    @type: str

    returns df: the SAME df object that was passed in (modified in place),
    with 2 added columns. If column names clash, the function column
    name will override original column. Values that are not a key of
    oneletter_aa_dict end up as NaN in the added columns.
    @rtype: pandas df
    """

    # one letter code -> 3 letter lowercase code
    lookup_dict_aa_3lower = {k: v['three_letter_code_lower']
                             for k, v in oneletter_aa_dict.items()}

    if DEBUG:
        print('DICT:', lookup_dict_aa_3lower)

    df[col_wt] = df[wt_colname].map(lookup_dict_aa_3lower)
    df[col_mut] = df[mut_colname].map(lookup_dict_aa_3lower)

    return df
#%%
#==================================
# example: get_aa_3lower()
#==================================
# test_filename = '/home/tanu/git/Data/streptomycin/output/gid_complex_mcsm_norm_SAM.csv'
# test_df = pd.read_csv(test_filename , sep = ',')

# my_wt_colname = 'wild_type'
# my_mut_colname = 'mutant_type'
# my_col1 = 'wt_aa_3lower'
# my_col2 = 'mut_aa_3lower'

# get_aa_3lower(df = test_df
#               , wt_colname = my_wt_colname
#               , mut_colname = my_mut_colname
#               , col_wt = my_col1
#               , col_mut = my_col2)
#%%###########################################################################
# FUNCTION: using gwas mutation format to split mutation info into
# 3 separate columns for wild type, position and mutation
###############################################################################
def get_aa_1upper(df
                  , gwas_mut_colname = 'mutation'
                  , wt_colname = 'wt_aa_1upper'
                  , pos_colname = 'position'
                  , mut_colname = 'mut_aa_1upper'):

    """Add 1 letter aa code for wt and mutant residues, and the aa position, from a gwas style mutation column

    Each value in the gwas mutation column is matched against a static
    regex: any prefix ending in '_p.', then 3 letters (wild type),
    then digits (position), then 3 letters (mutant type).

    @df: df containing the gwas style mutation column
    @type: pandas df

    @gwas_mut_colname: column containing the gwas style mutation
    @type: str

    @wt_colname: name of the added column with the wild type aa code
    looked up in low_3letter_dict ('one_letter_code')
    @type: str

    @pos_colname: name of the added column with the aa position. The value
    is stored as captured by the regex, it is NOT converted to int
    @type: str

    @mut_colname: name of the added column with the mutant type aa code
    looked up in low_3letter_dict ('one_letter_code')
    @type: str

    returns df: the SAME df object that was passed in (modified in place),
    with 3 added columns. If column names clash, the function column
    name will override original column. Rows that do not match the regex,
    or whose 3 letter code is not a key of low_3letter_dict, get NaN.
    @rtype: pandas df
    """

    # static regex
    gwas_regex = r'^.*_p\.([A-Za-z]{3})([0-9]+)([A-Za-z]{3})$'

    # extract once: columns 0, 1, 2 hold the wt, position and mut groups
    gwas_parts = df[gwas_mut_colname].str.extract(gwas_regex)

    # 3 letter code -> one letter code
    # NOTE(review): the regex accepts upper and lower case letters, the
    # extracted text is looked up as-is -- confirm the case of the dict keys
    lookup_dict_aa_1upper = {k: v['one_letter_code']
                             for k, v in low_3letter_dict.items()}

    if DEBUG:
        print('DICT:', lookup_dict_aa_1upper)

    # wild type
    df[wt_colname] = gwas_parts[0].map(lookup_dict_aa_1upper)

    # position
    df[pos_colname] = gwas_parts[1]

    # mutant type
    df[mut_colname] = gwas_parts[2].map(lookup_dict_aa_1upper)

    return df
#%%
#==================================
# example: get_aa_1upper()
#==================================
# test_filename2 = '/home/tanu/git/Data/streptomycin/output/gid_af_or.csv'
# test_df2 = pd.read_csv(test_filename2 , sep = ',')

# get_aa_1upper(df = test_df2
#               , gwas_mut_colname = 'mutation'
#               , wt_colname = 'wild_type'
#               , pos_colname = 'position'
#               , mut_colname = 'mutant_type')